use regex::Regex;
use scraper::{Html, Selector};
use url::Url;
use crate::types::assets::{ImageAsset, ImageSourceType, SrcsetEntry};
/// Extracts image references from an HTML document.
///
/// Two flags control the behaviour (see [`ImageExtractor::with_options`]):
/// `resolve_lazy` makes `<img>` extraction prefer lazy-loading attributes
/// (`data-src`, `data-lazy`, `data-original`) over `src`, and
/// `extract_css_backgrounds` additionally scans inline `style` attributes for
/// `url(...)` references.
#[derive(Debug, Clone)]
pub struct ImageExtractor {
    /// Prefer lazy-loading attributes over `src` on `<img>` elements.
    resolve_lazy: bool,
    /// Also collect `url(...)` references found in inline `style` attributes.
    extract_css_backgrounds: bool,
}
impl Default for ImageExtractor {
fn default() -> Self {
Self {
resolve_lazy: true,
extract_css_backgrounds: false,
}
}
}
impl ImageExtractor {
pub fn new() -> Self {
Self::default()
}
pub fn with_options(mut self, resolve_lazy: bool, extract_css_backgrounds: bool) -> Self {
self.resolve_lazy = resolve_lazy;
self.extract_css_backgrounds = extract_css_backgrounds;
self
}
/// Parses `html` and returns the images it references, ordered by URL with
/// duplicate URLs collapsed into a single entry.
///
/// `<img>` results are gathered first, then `<picture>` sources, then (only
/// when `extract_css_backgrounds` is enabled) inline-style backgrounds.
/// Relative references are resolved against `base_url`.
pub fn extract(&self, html: &str, base_url: &Url) -> Vec<ImageAsset> {
    let document = Html::parse_document(html);

    let mut found = self.extract_img_tags(&document, base_url);
    found.extend(self.extract_picture_tags(&document, base_url));
    if self.extract_css_backgrounds {
        found.extend(self.extract_css_backgrounds_from_style(&document, base_url));
    }

    // `sort_by` is stable, so among entries sharing a URL the one gathered
    // earliest stays first, and `dedup_by` keeps that first one.
    found.sort_by(|left, right| left.url.as_str().cmp(right.url.as_str()));
    found.dedup_by(|later, earlier| later.url == earlier.url);
    found
}
/// Collects one [`ImageAsset`] for every `<img>` element that carries a usable URL.
///
/// When `resolve_lazy` is enabled, the first usable value among `data-src`,
/// `data-lazy` and `data-original` (in that order) wins over `src`. A value is
/// usable when, after trimming surrounding whitespace, it is non-empty and is
/// not an inline `data:` URI. An empty or `data:` lazy attribute therefore no
/// longer hides a perfectly good `src` (or a later lazy attribute).
///
/// Elements with no usable URL, or whose URL cannot be resolved against
/// `base_url`, are skipped. `alt`, `width`, `height` and `srcset` are taken from
/// the element's attributes (`width`/`height` are `None` when they do not
/// parse); `file_size` and `mime_type` are left as `None`. The asset's
/// `lazy_src` is set to the resolved URL only when a lazy attribute was chosen
/// and the element also has a `src` attribute.
fn extract_img_tags(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
    // Attributes commonly used by lazy-loading scripts, in priority order.
    const LAZY_ATTRS: [&str; 3] = ["data-src", "data-lazy", "data-original"];

    // A candidate is worth resolving only if it is non-empty and not inline data.
    fn is_usable(value: &&str) -> bool {
        !value.is_empty() && !value.starts_with("data:")
    }

    let selector = Selector::parse("img").expect("`img` is a valid CSS selector");
    let mut images = Vec::new();

    for img in document.select(&selector) {
        let attrs = img.value();
        let src = attrs.attr("src").map(str::trim);

        let lazy_src = if self.resolve_lazy {
            LAZY_ATTRS
                .iter()
                .filter_map(|&name| attrs.attr(name))
                .map(str::trim)
                .find(is_usable)
        } else {
            None
        };

        // Prefer the lazy candidate; otherwise fall back to a usable `src`.
        let resolved = lazy_src
            .or_else(|| src.filter(is_usable))
            .and_then(|raw| base_url.join(raw).ok());
        let url = match resolved {
            Some(url) => url,
            None => continue,
        };

        // When the lazy attribute was chosen, `url` already is its resolved
        // form, so there is no need to join it a second time.
        let lazy_src_url = (lazy_src.is_some() && src.is_some()).then(|| url.clone());

        images.push(ImageAsset {
            url,
            alt: attrs.attr("alt").map(String::from),
            width: attrs.attr("width").and_then(|w| w.parse().ok()),
            height: attrs.attr("height").and_then(|h| h.parse().ok()),
            srcset: attrs.attr("srcset").map(|s| self.parse_srcset(s, base_url)),
            lazy_src: lazy_src_url,
            file_size: None,
            mime_type: None,
            source_type: ImageSourceType::Img,
        });
    }

    images
}
/// Collects one [`ImageAsset`] per `<source>` element nested inside a `<picture>`.
///
/// The candidate list is read from the `srcset` attribute, or from `src` when
/// `srcset` is absent. Sources that yield no parseable candidate are skipped.
/// The first candidate's URL becomes the asset URL, the full list is kept in
/// `srcset`, and the `type` attribute (if any) is stored as `mime_type`.
fn extract_picture_tags(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
    let pictures = Selector::parse("picture").unwrap();
    let sources = Selector::parse("source").unwrap();

    let mut found = Vec::new();
    for picture in document.select(&pictures) {
        for source in picture.select(&sources) {
            let element = source.value();

            let raw = match element.attr("srcset").or_else(|| element.attr("src")) {
                Some(raw) => raw,
                None => continue,
            };

            let candidates = self.parse_srcset(raw, base_url);
            let primary = match candidates.first() {
                Some(entry) => entry.url.clone(),
                None => continue,
            };

            found.push(ImageAsset {
                url: primary,
                alt: None,
                width: None,
                height: None,
                srcset: Some(candidates),
                lazy_src: None,
                file_size: None,
                mime_type: element.attr("type").map(String::from),
                source_type: ImageSourceType::Picture,
            });
        }
    }
    found
}
/// Parses a `srcset` attribute value into its image candidates.
///
/// Candidates are split following the HTML Standard's rules rather than on
/// every comma: a URL is a maximal run of non-whitespace characters, so commas
/// *inside* a URL (for example CDN transformation parameters such as
/// `w_300,h_200`) are preserved. Only commas at the end of the URL token, or
/// after the descriptors, separate candidates. As a consequence `a.png,b.png`
/// (no whitespace) is a single URL, exactly as browsers interpret it.
///
/// Each URL is resolved against `base_url`; candidates that fail to resolve and
/// inline `data:` URIs are dropped. The descriptor is the first token following
/// the URL (e.g. `2x`, `480w`) and defaults to `"1x"` when absent.
fn parse_srcset(&self, srcset: &str, base_url: &Url) -> Vec<SrcsetEntry> {
    let is_separator = |c: char| c.is_whitespace() || c == ',';

    let mut entries = Vec::new();
    let mut rest = srcset;
    loop {
        // Skip whitespace and stray commas between candidates.
        rest = rest.trim_start_matches(is_separator);
        if rest.is_empty() {
            break;
        }

        // The URL token runs up to the next whitespace (or end of input). It
        // is never empty because `rest` starts with a non-separator character.
        let url_end = rest.find(char::is_whitespace).unwrap_or(rest.len());
        let (raw_url, after_url) = rest.split_at(url_end);

        let (url_str, descriptor) = if raw_url.ends_with(',') {
            // Trailing commas terminate the candidate: no descriptors follow.
            rest = after_url;
            (raw_url.trim_end_matches(','), None)
        } else {
            // Everything up to the next comma belongs to this candidate's descriptors.
            let descriptors_end = after_url.find(',').unwrap_or(after_url.len());
            let (descriptors, remainder) = after_url.split_at(descriptors_end);
            rest = remainder;
            (raw_url, descriptors.split_whitespace().next())
        };

        // Inline data is not a fetchable asset; the other extractors skip it too.
        if url_str.starts_with("data:") {
            continue;
        }

        if let Ok(url) = base_url.join(url_str) {
            entries.push(SrcsetEntry {
                url,
                descriptor: descriptor.unwrap_or("1x").to_string(),
            });
        }
    }
    entries
}
/// Collects images referenced through `url(...)` inside inline `style` attributes.
///
/// Every element carrying a `style` attribute is scanned; each `url(...)`
/// target that is not an inline `data:` URI and that resolves against
/// `base_url` yields one [`ImageAsset`] tagged as a CSS background. All other
/// asset fields are left empty.
fn extract_css_backgrounds_from_style(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
    // Captures the target of `url(...)`, with or without surrounding quotes.
    let css_url = Regex::new(r#"url\s*\(\s*['"]?([^'")\s]+)['"]?\s*\)"#).unwrap();
    let styled = Selector::parse("[style]").unwrap();

    let mut backgrounds = Vec::new();
    for element in document.select(&styled) {
        let declarations = match element.value().attr("style") {
            Some(text) => text,
            None => continue,
        };

        let resolved = css_url
            .captures_iter(declarations)
            .filter_map(|caps| caps.get(1))
            .map(|target| target.as_str())
            .filter(|target| !target.starts_with("data:"))
            .filter_map(|target| base_url.join(target).ok());

        backgrounds.extend(resolved.map(|url| ImageAsset {
            url,
            alt: None,
            width: None,
            height: None,
            srcset: None,
            lazy_src: None,
            file_size: None,
            mime_type: None,
            source_type: ImageSourceType::CssBackground,
        }));
    }
    backgrounds
}
}