essence/format/
image_processing.rs1use regex::Regex;
2use std::sync::LazyLock;
3
/// Returns the URL of the widest candidate in an HTML `srcset` attribute value.
///
/// The value is tokenised the way the HTML "parse a srcset attribute" algorithm
/// does it: a URL is a run of non-whitespace characters, so commas *inside* a
/// URL (CDN transform paths, `data:` URIs) stay part of it. A comma only ends a
/// candidate when it trails the URL token or follows the descriptors.
///
/// Only the first descriptor of each candidate is considered. Candidates
/// without a usable descriptor rank as width `0`. On ties the candidate that
/// appears first wins.
///
/// Returns `None` when `srcset` contains no candidate at all (empty string,
/// whitespace, or only commas).
pub fn parse_srcset_pick_largest(srcset: &str) -> Option<String> {
    let mut best: Option<ImageSource> = None;
    let mut rest = srcset;

    loop {
        // Skip the separators that may precede a candidate.
        rest = rest.trim_start_matches(|c: char| c.is_whitespace() || c == ',');
        if rest.is_empty() {
            break;
        }

        // The URL runs up to the next whitespace; `rest` starts with a
        // non-comma character here, so the URL is never empty.
        let url_end = rest.find(char::is_whitespace).unwrap_or(rest.len());
        let (raw_url, after_url) = rest.split_at(url_end);
        let url = raw_url.trim_end_matches(',');

        let (descriptors, remainder) = if url.len() < raw_url.len() {
            // A comma glued to the URL ends the candidate: no descriptors.
            ("", after_url)
        } else {
            // Otherwise everything up to the next comma is descriptor text.
            after_url.split_once(',').unwrap_or((after_url, ""))
        };
        rest = remainder;

        let width = descriptors
            .split_whitespace()
            .next()
            .and_then(parse_width_descriptor);

        // Strictly-greater comparison keeps the earliest candidate on ties.
        let is_larger = best
            .as_ref()
            .map_or(true, |current| width.unwrap_or(0) > current.width.unwrap_or(0));
        if is_larger {
            best = Some(ImageSource {
                url: url.to_string(),
                width,
            });
        }
    }

    best.map(|source| source.url)
}

/// Converts a single srcset descriptor into a comparable pixel width.
///
/// * `"<n>w"` yields `n`.
/// * `"<f>x"` (pixel density) yields `f * 600`, i.e. densities are ranked as if
///   `1x` were a 600px wide image.
/// * Anything else yields `None`, including repeated suffix letters (`"100ww"`)
///   and negative, infinite or NaN densities.
fn parse_width_descriptor(desc: &str) -> Option<u32> {
    /// Width assumed for a `1x` candidate when ranking density descriptors.
    const DENSITY_BASE_WIDTH: f32 = 600.0;

    if let Some(digits) = desc.strip_suffix('w') {
        digits.parse().ok()
    } else if let Some(number) = desc.strip_suffix('x') {
        let density = number.parse::<f32>().ok()?;
        // `f32::from_str` accepts "inf" and "nan"; neither is a valid density.
        if !density.is_finite() || density < 0.0 {
            return None;
        }
        // Float-to-int `as` saturates, so absurdly large densities cap at u32::MAX.
        Some((density * DENSITY_BASE_WIDTH) as u32)
    } else {
        None
    }
}

/// One candidate parsed from a `srcset` value.
#[derive(Debug, Clone)]
struct ImageSource {
    /// Candidate URL exactly as written in the attribute.
    url: String,
    /// Comparable width derived from the descriptor, if one was usable.
    width: Option<u32>,
}
64
65pub fn resolve_srcsets(html: &str) -> String {
67 static RE_IMG_SRCSET: LazyLock<Regex> = LazyLock::new(|| {
68 Regex::new(r#"<img[^>]*srcset="[^"]*"[^>]*>"#).unwrap()
69 });
70 static RE_SRCSET_ATTR: LazyLock<Regex> = LazyLock::new(|| {
71 Regex::new(r#"srcset="([^"]*)""#).unwrap()
72 });
73 static RE_SRC_ATTR: LazyLock<Regex> = LazyLock::new(|| {
74 Regex::new(r#"src="[^"]*""#).unwrap()
75 });
76 static RE_SRCSET_REMOVE: LazyLock<Regex> = LazyLock::new(|| {
77 Regex::new(r#"\s*srcset="[^"]*""#).unwrap()
78 });
79
80 let mut result = html.to_string();
81 let mut replacements: Vec<(String, String)> = Vec::new();
82
83 for img_match in RE_IMG_SRCSET.find_iter(html) {
84 let old_tag = img_match.as_str();
85
86 if let Some(srcset_cap) = RE_SRCSET_ATTR.captures(old_tag) {
87 let srcset = &srcset_cap[1];
88
89 if let Some(largest) = parse_srcset_pick_largest(srcset) {
90 let mut new_tag = old_tag.to_string();
91
92 if RE_SRC_ATTR.is_match(&new_tag) {
93 new_tag = RE_SRC_ATTR.replace(&new_tag, &format!(r#"src="{}""#, largest)).to_string();
94 } else {
95 new_tag = new_tag.replace("<img ", &format!(r#"<img src="{}" "#, largest));
96 }
97
98 new_tag = RE_SRCSET_REMOVE.replace(&new_tag, "").to_string();
99 replacements.push((old_tag.to_string(), new_tag));
100 }
101 }
102 }
103
104 for (old, new) in replacements {
105 result = result.replace(&old, &new);
106 }
107
108 result
109}
110
111pub fn rescue_noscript_images(html: &str) -> String {
115 static RE_NOSCRIPT: LazyLock<Regex> = LazyLock::new(|| {
116 Regex::new(r"(?is)<noscript[^>]*>(.*?)</noscript>").unwrap()
117 });
118 static RE_IMG: LazyLock<Regex> = LazyLock::new(|| {
119 Regex::new(r"(?is)<img\s[^>]*>").unwrap()
120 });
121
122 let mut rescued = Vec::new();
123 for cap in RE_NOSCRIPT.captures_iter(html) {
124 let inner = &cap[1];
125 for img in RE_IMG.find_iter(inner) {
126 rescued.push(img.as_str().to_string());
127 }
128 }
129
130 if rescued.is_empty() {
131 return html.to_string();
132 }
133
134 let insertion = rescued.join("\n");
136 if let Some(pos) = html.to_lowercase().rfind("</body>") {
137 let mut result = html.to_string();
138 result.insert_str(pos, &format!("\n{}\n", insertion));
139 result
140 } else {
141 format!("{}\n{}", html, insertion)
142 }
143}
144
145pub fn resolve_picture_elements(html: &str) -> String {
147 static RE_PICTURE: LazyLock<Regex> = LazyLock::new(|| {
148 Regex::new(r"(?is)<picture[^>]*>(.*?)</picture>").unwrap()
149 });
150 static RE_SOURCE_SRCSET: LazyLock<Regex> = LazyLock::new(|| {
151 Regex::new(r#"(?is)<source[^>]*srcset\s*=\s*["']([^"']+)["'][^>]*>"#).unwrap()
152 });
153 static RE_IMG_TAG: LazyLock<Regex> = LazyLock::new(|| {
154 Regex::new(r"(?is)<img\s[^>]*>").unwrap()
155 });
156
157 RE_PICTURE.replace_all(html, |caps: ®ex::Captures| {
158 let inner = &caps[1];
159
160 let mut best_url: Option<String> = None;
162 for source_cap in RE_SOURCE_SRCSET.captures_iter(inner) {
163 let srcset = &source_cap[1];
164 if let Some(url) = parse_srcset_pick_largest(srcset) {
165 best_url = Some(url);
166 }
167 }
168
169 if let Some(url) = best_url {
171 if let Some(img_match) = RE_IMG_TAG.find(inner) {
173 let img_tag = img_match.as_str();
174 let alt_regex = Regex::new(r#"alt\s*=\s*["']([^"']*?)["']"#).unwrap();
175 let alt = alt_regex.captures(img_tag)
176 .map(|c| c[1].to_string())
177 .unwrap_or_default();
178 format!(r#"<img src="{}" alt="{}">"#, url, alt)
179 } else {
180 format!(r#"<img src="{}" alt="">"#, url)
181 }
182 } else if let Some(img_match) = RE_IMG_TAG.find(inner) {
183 img_match.as_str().to_string()
185 } else {
186 String::new()
187 }
188 }).to_string()
189}
190
191pub fn resolve_lazy_images(html: &str) -> String {
193 static RE_LAZY_IMG: LazyLock<Regex> = LazyLock::new(|| {
194 Regex::new(r#"(?is)<img\s[^>]*data-(?:src|lazy-src|original|lazy-load)\s*=\s*["'][^"']+["'][^>]*>"#).unwrap()
195 });
196 static RE_DATA_SRC: LazyLock<Regex> = LazyLock::new(|| {
197 Regex::new(r#"data-(?:src|lazy-src|original|lazy-load)\s*=\s*["']([^"']+)["']"#).unwrap()
198 });
199 static RE_HAS_REAL_SRC: LazyLock<Regex> = LazyLock::new(|| {
200 Regex::new(r#"(?i)\bsrc\s*=\s*["']([^"']+)["']"#).unwrap()
201 });
202
203 RE_LAZY_IMG.replace_all(html, |caps: ®ex::Captures| {
204 let tag = &caps[0];
205
206 if let Some(data_cap) = RE_DATA_SRC.captures(tag) {
208 let lazy_url = &data_cap[1];
209
210 if let Some(src_cap) = RE_HAS_REAL_SRC.captures(tag) {
212 if !src_cap[1].starts_with("data:") {
213 return tag.to_string();
214 }
215 }
216
217 let src_attr = Regex::new(r#"src\s*=\s*["'][^"']*["']"#).unwrap();
219 if src_attr.is_match(tag) {
220 src_attr.replace(tag, &format!(r#"src="{}""#, lazy_url)).to_string()
221 } else {
222 tag.replace("<img ", &format!(r#"<img src="{}" "#, lazy_url))
223 }
224 } else {
225 tag.to_string()
226 }
227 }).to_string()
228}
229
230pub fn resolve_video_posters(html: &str) -> String {
232 static RE_VIDEO_POSTER: LazyLock<Regex> = LazyLock::new(|| {
233 Regex::new(r#"(?is)<video[^>]*poster\s*=\s*["']([^"']+)["'][^>]*>.*?</video>"#).unwrap()
234 });
235
236 RE_VIDEO_POSTER.replace_all(html, |caps: ®ex::Captures| {
237 let poster_url = &caps[1];
238 let original = &caps[0];
239 format!(r#"{}<img src="{}" alt="Video poster">"#, original, poster_url)
241 }).to_string()
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    // --- parse_srcset_pick_largest -------------------------------------

    #[test]
    fn test_parse_srcset_width_descriptors() {
        let picked = parse_srcset_pick_largest("small.jpg 300w, medium.jpg 600w, large.jpg 1200w");
        assert_eq!(picked.as_deref(), Some("large.jpg"));
    }

    #[test]
    fn test_parse_srcset_density_descriptors() {
        let picked = parse_srcset_pick_largest("small.jpg 1x, large.jpg 2x");
        assert_eq!(picked.as_deref(), Some("large.jpg"));
    }

    #[test]
    fn test_parse_srcset_mixed() {
        let picked = parse_srcset_pick_largest("small.jpg 300w, medium.jpg 2x, large.jpg 1600w");
        assert_eq!(picked.as_deref(), Some("large.jpg"));
    }

    #[test]
    fn test_parse_srcset_single() {
        let picked = parse_srcset_pick_largest("image.jpg 500w");
        assert_eq!(picked.as_deref(), Some("image.jpg"));
    }

    #[test]
    fn test_parse_srcset_no_descriptor() {
        let picked = parse_srcset_pick_largest("image.jpg");
        assert_eq!(picked.as_deref(), Some("image.jpg"));
    }

    // --- resolve_srcsets -----------------------------------------------

    #[test]
    fn test_resolve_srcsets() {
        let resolved = resolve_srcsets(
            r#"<img srcset="small.jpg 300w, large.jpg 1200w" src="small.jpg" alt="Test">"#,
        );
        assert!(resolved.contains(r#"src="large.jpg""#));
        assert!(!resolved.contains("srcset="));
    }

    #[test]
    fn test_resolve_srcsets_no_existing_src() {
        let resolved =
            resolve_srcsets(r#"<img srcset="small.jpg 300w, large.jpg 1200w" alt="Test">"#);
        assert!(resolved.contains(r#"src="large.jpg""#));
        assert!(!resolved.contains("srcset="));
    }

    #[test]
    fn test_resolve_srcsets_multiple_images() {
        let html = r#"
            <img srcset="img1-small.jpg 300w, img1-large.jpg 1200w" alt="First">
            <img srcset="img2-small.jpg 400w, img2-large.jpg 1600w" alt="Second">
        "#;
        let resolved = resolve_srcsets(html);
        for expected in [r#"src="img1-large.jpg""#, r#"src="img2-large.jpg""#] {
            assert!(resolved.contains(expected));
        }
        assert!(!resolved.contains("srcset="));
    }

    #[test]
    fn test_resolve_srcsets_preserves_other_attributes() {
        let resolved = resolve_srcsets(
            r#"<img width="100" height="100" srcset="small.jpg 300w, large.jpg 1200w" alt="Logo" class="image">"#,
        );
        for expected in [
            r#"width="100""#,
            r#"height="100""#,
            r#"alt="Logo""#,
            r#"class="image""#,
            r#"src="large.jpg""#,
        ] {
            assert!(resolved.contains(expected));
        }
        assert!(!resolved.contains("srcset="));
    }

    #[test]
    fn test_resolve_srcsets_no_srcset_unchanged() {
        let html = r#"<img src="regular.jpg" alt="Normal">"#;
        assert_eq!(resolve_srcsets(html), html);
    }

    #[test]
    fn test_resolve_srcsets_retina_display() {
        let resolved = resolve_srcsets(
            r#"<img srcset="image.jpg 1x, image@2x.jpg 2x, image@3x.jpg 3x" alt="Retina">"#,
        );
        assert!(resolved.contains(r#"src="image@3x.jpg""#));
        assert!(!resolved.contains("srcset="));
    }

    // --- parse_width_descriptor ----------------------------------------

    #[test]
    fn test_parse_width_descriptor_pixels() {
        for (descriptor, expected) in [("800w", 800), ("1200w", 1200)] {
            assert_eq!(parse_width_descriptor(descriptor), Some(expected));
        }
    }

    #[test]
    fn test_parse_width_descriptor_retina() {
        for (descriptor, expected) in [("1x", 600), ("2x", 1200), ("3x", 1800)] {
            assert_eq!(parse_width_descriptor(descriptor), Some(expected));
        }
    }

    #[test]
    fn test_parse_width_descriptor_invalid() {
        for descriptor in ["invalid", ""] {
            assert_eq!(parse_width_descriptor(descriptor), None);
        }
    }

    // --- degenerate srcset values --------------------------------------

    #[test]
    fn test_parse_srcset_empty() {
        assert_eq!(parse_srcset_pick_largest(""), None);
    }

    #[test]
    fn test_parse_srcset_whitespace_only() {
        assert_eq!(parse_srcset_pick_largest("   "), None);
    }
}