halldyll_core/parse/
images.rs

1//! Images - Image extraction
2
3use regex::Regex;
4use scraper::{Html, Selector};
5use url::Url;
6
7use crate::types::assets::{ImageAsset, ImageSourceType, SrcsetEntry};
8
/// Extracts image assets from an HTML document.
///
/// The extractor is a small, cheap-to-copy configuration object; it holds no
/// per-document state, so one instance can be reused across documents.
#[derive(Debug, Clone)]
pub struct ImageExtractor {
    /// When `true`, lazy-loading attributes (`data-src`, `data-lazy`,
    /// `data-original`) are consulted before the plain `src` attribute.
    resolve_lazy: bool,
    /// When `true`, `url(...)` references found in inline `style` attributes
    /// are collected as well.
    extract_css_backgrounds: bool,
}
16
17impl Default for ImageExtractor {
18    fn default() -> Self {
19        Self {
20            resolve_lazy: true,
21            extract_css_backgrounds: false,
22        }
23    }
24}
25
26impl ImageExtractor {
27    /// New extractor
28    pub fn new() -> Self {
29        Self::default()
30    }
31
32    /// Configure options
33    pub fn with_options(mut self, resolve_lazy: bool, extract_css_backgrounds: bool) -> Self {
34        self.resolve_lazy = resolve_lazy;
35        self.extract_css_backgrounds = extract_css_backgrounds;
36        self
37    }
38
39    /// Extract all images
40    pub fn extract(&self, html: &str, base_url: &Url) -> Vec<ImageAsset> {
41        let document = Html::parse_document(html);
42        let mut images = Vec::new();
43
44        // 1. Images <img>
45        images.extend(self.extract_img_tags(&document, base_url));
46
47        // 2. Images <picture><source>
48        images.extend(self.extract_picture_tags(&document, base_url));
49
50        // 3. CSS background-image (optionnel)
51        if self.extract_css_backgrounds {
52            images.extend(self.extract_css_backgrounds_from_style(&document, base_url));
53        }
54
55        // Deduplicate by URL
56        images.sort_by(|a, b| a.url.as_str().cmp(b.url.as_str()));
57        images.dedup_by(|a, b| a.url == b.url);
58
59        images
60    }
61
62    /// Extract <img> tags
63    fn extract_img_tags(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
64        let selector = Selector::parse("img").unwrap();
65        let mut images = Vec::new();
66
67        for img in document.select(&selector) {
68            let attrs = img.value();
69
70            // Main source
71            let src = attrs.attr("src");
72            
73            // Lazy-loading sources
74            let lazy_src = if self.resolve_lazy {
75                attrs.attr("data-src")
76                    .or_else(|| attrs.attr("data-lazy"))
77                    .or_else(|| attrs.attr("data-original"))
78                    .or_else(|| attrs.attr("loading") .filter(|_| attrs.attr("data-src").is_some()).and_then(|_| attrs.attr("data-src")))
79            } else {
80                None
81            };
82
83            // Determine final URL
84            let url_str = lazy_src.or(src);
85            let url = match url_str {
86                Some(s) if !s.is_empty() && !s.starts_with("data:") => {
87                    base_url.join(s).ok()
88                }
89                _ => continue,
90            };
91
92            let url = match url {
93                Some(u) => u,
94                None => continue,
95            };
96
97            // Alt text
98            let alt = attrs.attr("alt").map(String::from);
99
100            // Dimensions
101            let width = attrs.attr("width").and_then(|w| w.parse().ok());
102            let height = attrs.attr("height").and_then(|h| h.parse().ok());
103
104            // Srcset
105            let srcset = attrs.attr("srcset").map(|s| self.parse_srcset(s, base_url));
106
107            // Lazy src en tant qu'URL
108            let lazy_src_url = if lazy_src.is_some() && src.is_some() {
109                lazy_src.and_then(|s| base_url.join(s).ok())
110            } else {
111                None
112            };
113
114            images.push(ImageAsset {
115                url,
116                alt,
117                width,
118                height,
119                srcset,
120                lazy_src: lazy_src_url,
121                file_size: None,
122                mime_type: None,
123                source_type: ImageSourceType::Img,
124            });
125        }
126
127        images
128    }
129
130    /// Extract <picture><source> tags
131    fn extract_picture_tags(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
132        let picture_selector = Selector::parse("picture").unwrap();
133        let source_selector = Selector::parse("source").unwrap();
134        let mut images = Vec::new();
135
136        for picture in document.select(&picture_selector) {
137            for source in picture.select(&source_selector) {
138                let attrs = source.value();
139
140                // srcset ou src
141                let srcset_str = attrs.attr("srcset").or_else(|| attrs.attr("src"));
142                if srcset_str.is_none() {
143                    continue;
144                }
145
146                let srcset = self.parse_srcset(srcset_str.unwrap(), base_url);
147                if srcset.is_empty() {
148                    continue;
149                }
150
151                // Take first entry as main URL
152                let url = srcset[0].url.clone();
153
154                images.push(ImageAsset {
155                    url,
156                    alt: None,
157                    width: None,
158                    height: None,
159                    srcset: Some(srcset),
160                    lazy_src: None,
161                    file_size: None,
162                    mime_type: attrs.attr("type").map(String::from),
163                    source_type: ImageSourceType::Picture,
164                });
165            }
166        }
167
168        images
169    }
170
171    /// Parse a srcset attribute
172    fn parse_srcset(&self, srcset: &str, base_url: &Url) -> Vec<SrcsetEntry> {
173        srcset
174            .split(',')
175            .filter_map(|entry| {
176                let parts: Vec<&str> = entry.trim().split_whitespace().collect();
177                if parts.is_empty() {
178                    return None;
179                }
180
181                let url = base_url.join(parts[0]).ok()?;
182                let descriptor = parts.get(1).map(|s| s.to_string()).unwrap_or_else(|| "1x".to_string());
183
184                Some(SrcsetEntry { url, descriptor })
185            })
186            .collect()
187    }
188
189    /// Extract CSS background-image images
190    fn extract_css_backgrounds_from_style(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
191        let mut images = Vec::new();
192        let url_regex = Regex::new(r#"url\s*\(\s*['"]?([^'")\s]+)['"]?\s*\)"#).unwrap();
193
194        // Styles inline
195        let all_selector = Selector::parse("[style]").unwrap();
196        for element in document.select(&all_selector) {
197            if let Some(style) = element.value().attr("style") {
198                for cap in url_regex.captures_iter(style) {
199                    if let Some(url_match) = cap.get(1) {
200                        let url_str = url_match.as_str();
201                        if !url_str.starts_with("data:") {
202                            if let Ok(url) = base_url.join(url_str) {
203                                images.push(ImageAsset {
204                                    url,
205                                    alt: None,
206                                    width: None,
207                                    height: None,
208                                    srcset: None,
209                                    lazy_src: None,
210                                    file_size: None,
211                                    mime_type: None,
212                                    source_type: ImageSourceType::CssBackground,
213                                });
214                            }
215                        }
216                    }
217                }
218            }
219        }
220
221        images
222    }
223}