halldyll_core/parse/
images.rs1use regex::Regex;
4use scraper::{Html, Selector};
5use url::Url;
6
7use crate::types::assets::{ImageAsset, ImageSourceType, SrcsetEntry};
8
/// Configurable extractor that collects image references from an HTML document.
///
/// Both fields are plain switches consulted by the extraction methods; the
/// struct holds no per-document state, so a single value can be reused (or
/// cloned) freely.
#[derive(Debug, Clone)]
pub struct ImageExtractor {
    /// When `true`, lazy-loading attributes on `<img>` elements (`data-src`,
    /// `data-lazy`, `data-original`) take precedence over `src`.
    resolve_lazy: bool,
    /// When `true`, `url(...)` references found in inline `style` attributes
    /// are collected as well.
    extract_css_backgrounds: bool,
}
16
17impl Default for ImageExtractor {
18 fn default() -> Self {
19 Self {
20 resolve_lazy: true,
21 extract_css_backgrounds: false,
22 }
23 }
24}
25
26impl ImageExtractor {
27 pub fn new() -> Self {
29 Self::default()
30 }
31
32 pub fn with_options(mut self, resolve_lazy: bool, extract_css_backgrounds: bool) -> Self {
34 self.resolve_lazy = resolve_lazy;
35 self.extract_css_backgrounds = extract_css_backgrounds;
36 self
37 }
38
39 pub fn extract(&self, html: &str, base_url: &Url) -> Vec<ImageAsset> {
41 let document = Html::parse_document(html);
42 let mut images = Vec::new();
43
44 images.extend(self.extract_img_tags(&document, base_url));
46
47 images.extend(self.extract_picture_tags(&document, base_url));
49
50 if self.extract_css_backgrounds {
52 images.extend(self.extract_css_backgrounds_from_style(&document, base_url));
53 }
54
55 images.sort_by(|a, b| a.url.as_str().cmp(b.url.as_str()));
57 images.dedup_by(|a, b| a.url == b.url);
58
59 images
60 }
61
62 fn extract_img_tags(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
64 let selector = Selector::parse("img").unwrap();
65 let mut images = Vec::new();
66
67 for img in document.select(&selector) {
68 let attrs = img.value();
69
70 let src = attrs.attr("src");
72
73 let lazy_src = if self.resolve_lazy {
75 attrs.attr("data-src")
76 .or_else(|| attrs.attr("data-lazy"))
77 .or_else(|| attrs.attr("data-original"))
78 .or_else(|| attrs.attr("loading") .filter(|_| attrs.attr("data-src").is_some()).and_then(|_| attrs.attr("data-src")))
79 } else {
80 None
81 };
82
83 let url_str = lazy_src.or(src);
85 let url = match url_str {
86 Some(s) if !s.is_empty() && !s.starts_with("data:") => {
87 base_url.join(s).ok()
88 }
89 _ => continue,
90 };
91
92 let url = match url {
93 Some(u) => u,
94 None => continue,
95 };
96
97 let alt = attrs.attr("alt").map(String::from);
99
100 let width = attrs.attr("width").and_then(|w| w.parse().ok());
102 let height = attrs.attr("height").and_then(|h| h.parse().ok());
103
104 let srcset = attrs.attr("srcset").map(|s| self.parse_srcset(s, base_url));
106
107 let lazy_src_url = if lazy_src.is_some() && src.is_some() {
109 lazy_src.and_then(|s| base_url.join(s).ok())
110 } else {
111 None
112 };
113
114 images.push(ImageAsset {
115 url,
116 alt,
117 width,
118 height,
119 srcset,
120 lazy_src: lazy_src_url,
121 file_size: None,
122 mime_type: None,
123 source_type: ImageSourceType::Img,
124 });
125 }
126
127 images
128 }
129
130 fn extract_picture_tags(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
132 let picture_selector = Selector::parse("picture").unwrap();
133 let source_selector = Selector::parse("source").unwrap();
134 let mut images = Vec::new();
135
136 for picture in document.select(&picture_selector) {
137 for source in picture.select(&source_selector) {
138 let attrs = source.value();
139
140 let srcset_str = attrs.attr("srcset").or_else(|| attrs.attr("src"));
142 if srcset_str.is_none() {
143 continue;
144 }
145
146 let srcset = self.parse_srcset(srcset_str.unwrap(), base_url);
147 if srcset.is_empty() {
148 continue;
149 }
150
151 let url = srcset[0].url.clone();
153
154 images.push(ImageAsset {
155 url,
156 alt: None,
157 width: None,
158 height: None,
159 srcset: Some(srcset),
160 lazy_src: None,
161 file_size: None,
162 mime_type: attrs.attr("type").map(String::from),
163 source_type: ImageSourceType::Picture,
164 });
165 }
166 }
167
168 images
169 }
170
171 fn parse_srcset(&self, srcset: &str, base_url: &Url) -> Vec<SrcsetEntry> {
173 srcset
174 .split(',')
175 .filter_map(|entry| {
176 let parts: Vec<&str> = entry.trim().split_whitespace().collect();
177 if parts.is_empty() {
178 return None;
179 }
180
181 let url = base_url.join(parts[0]).ok()?;
182 let descriptor = parts.get(1).map(|s| s.to_string()).unwrap_or_else(|| "1x".to_string());
183
184 Some(SrcsetEntry { url, descriptor })
185 })
186 .collect()
187 }
188
189 fn extract_css_backgrounds_from_style(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
191 let mut images = Vec::new();
192 let url_regex = Regex::new(r#"url\s*\(\s*['"]?([^'")\s]+)['"]?\s*\)"#).unwrap();
193
194 let all_selector = Selector::parse("[style]").unwrap();
196 for element in document.select(&all_selector) {
197 if let Some(style) = element.value().attr("style") {
198 for cap in url_regex.captures_iter(style) {
199 if let Some(url_match) = cap.get(1) {
200 let url_str = url_match.as_str();
201 if !url_str.starts_with("data:") {
202 if let Ok(url) = base_url.join(url_str) {
203 images.push(ImageAsset {
204 url,
205 alt: None,
206 width: None,
207 height: None,
208 srcset: None,
209 lazy_src: None,
210 file_size: None,
211 mime_type: None,
212 source_type: ImageSourceType::CssBackground,
213 });
214 }
215 }
216 }
217 }
218 }
219 }
220
221 images
222 }
223}