// essence/format/image_processing.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
4/// Parse a srcset attribute and return the largest image
5/// Example: "small.jpg 300w, medium.jpg 600w, large.jpg 1200w" -> "large.jpg"
6pub fn parse_srcset_pick_largest(srcset: &str) -> Option<String> {
7    if srcset.trim().is_empty() {
8        return None;
9    }
10
11    let mut sources: Vec<ImageSource> = Vec::new();
12
13    for entry in srcset.split(',') {
14        let entry = entry.trim();
15        if entry.is_empty() {
16            continue;
17        }
18
19        // Parse "url 300w" or "url 2x"
20        let parts: Vec<&str> = entry.split_whitespace().collect();
21        if parts.is_empty() {
22            continue;
23        }
24
25        let url = parts[0].to_string();
26        let width = if parts.len() > 1 {
27            parse_width_descriptor(parts[1])
28        } else {
29            None
30        };
31
32        sources.push(ImageSource { url, width });
33    }
34
35    // Sort by width (descending), then return first
36    sources.sort_by(|a, b| {
37        b.width.unwrap_or(0).cmp(&a.width.unwrap_or(0))
38    });
39
40    sources.first().map(|s| s.url.clone())
41}
42
43/// Parse width descriptor like "300w" or "2x"
44fn parse_width_descriptor(desc: &str) -> Option<u32> {
45    if desc.ends_with('w') {
46        // Width in pixels: "300w"
47        desc.trim_end_matches('w').parse().ok()
48    } else if desc.ends_with('x') {
49        // Pixel density: "2x" -> treat as width multiplier
50        // Use 600px as base (typical image container width)
51        // So 1x=600, 2x=1200, 3x=1800
52        let multiplier = desc.trim_end_matches('x').parse::<f32>().ok()?;
53        Some((multiplier * 600.0) as u32)  // Convert to pseudo-width
54    } else {
55        None
56    }
57}
58
59#[derive(Debug, Clone)]
60struct ImageSource {
61    url: String,
62    width: Option<u32>,
63}
64
65/// Resolve all srcset attributes in HTML to use largest image
66pub fn resolve_srcsets(html: &str) -> String {
67    static RE_IMG_SRCSET: LazyLock<Regex> = LazyLock::new(|| {
68        Regex::new(r#"<img[^>]*srcset="[^"]*"[^>]*>"#).unwrap()
69    });
70    static RE_SRCSET_ATTR: LazyLock<Regex> = LazyLock::new(|| {
71        Regex::new(r#"srcset="([^"]*)""#).unwrap()
72    });
73    static RE_SRC_ATTR: LazyLock<Regex> = LazyLock::new(|| {
74        Regex::new(r#"src="[^"]*""#).unwrap()
75    });
76    static RE_SRCSET_REMOVE: LazyLock<Regex> = LazyLock::new(|| {
77        Regex::new(r#"\s*srcset="[^"]*""#).unwrap()
78    });
79
80    let mut result = html.to_string();
81    let mut replacements: Vec<(String, String)> = Vec::new();
82
83    for img_match in RE_IMG_SRCSET.find_iter(html) {
84        let old_tag = img_match.as_str();
85
86        if let Some(srcset_cap) = RE_SRCSET_ATTR.captures(old_tag) {
87            let srcset = &srcset_cap[1];
88
89            if let Some(largest) = parse_srcset_pick_largest(srcset) {
90                let mut new_tag = old_tag.to_string();
91
92                if RE_SRC_ATTR.is_match(&new_tag) {
93                    new_tag = RE_SRC_ATTR.replace(&new_tag, &format!(r#"src="{}""#, largest)).to_string();
94                } else {
95                    new_tag = new_tag.replace("<img ", &format!(r#"<img src="{}" "#, largest));
96                }
97
98                new_tag = RE_SRCSET_REMOVE.replace(&new_tag, "").to_string();
99                replacements.push((old_tag.to_string(), new_tag));
100            }
101        }
102    }
103
104    for (old, new) in replacements {
105        result = result.replace(&old, &new);
106    }
107
108    result
109}
110
111/// Rescue <img> tags from <noscript> blocks before noscript gets stripped.
112/// Many sites put the real <img> inside <noscript> while using lazy-loading JS
113/// in the main content. This extracts those images so they survive stripping.
114pub fn rescue_noscript_images(html: &str) -> String {
115    static RE_NOSCRIPT: LazyLock<Regex> = LazyLock::new(|| {
116        Regex::new(r"(?is)<noscript[^>]*>(.*?)</noscript>").unwrap()
117    });
118    static RE_IMG: LazyLock<Regex> = LazyLock::new(|| {
119        Regex::new(r"(?is)<img\s[^>]*>").unwrap()
120    });
121
122    let mut rescued = Vec::new();
123    for cap in RE_NOSCRIPT.captures_iter(html) {
124        let inner = &cap[1];
125        for img in RE_IMG.find_iter(inner) {
126            rescued.push(img.as_str().to_string());
127        }
128    }
129
130    if rescued.is_empty() {
131        return html.to_string();
132    }
133
134    // Insert rescued images just before </body> or at the end
135    let insertion = rescued.join("\n");
136    if let Some(pos) = html.to_lowercase().rfind("</body>") {
137        let mut result = html.to_string();
138        result.insert_str(pos, &format!("\n{}\n", insertion));
139        result
140    } else {
141        format!("{}\n{}", html, insertion)
142    }
143}
144
145/// Resolve <picture> elements to simple <img> tags by picking the largest <source>.
146pub fn resolve_picture_elements(html: &str) -> String {
147    static RE_PICTURE: LazyLock<Regex> = LazyLock::new(|| {
148        Regex::new(r"(?is)<picture[^>]*>(.*?)</picture>").unwrap()
149    });
150    static RE_SOURCE_SRCSET: LazyLock<Regex> = LazyLock::new(|| {
151        Regex::new(r#"(?is)<source[^>]*srcset\s*=\s*["']([^"']+)["'][^>]*>"#).unwrap()
152    });
153    static RE_IMG_TAG: LazyLock<Regex> = LazyLock::new(|| {
154        Regex::new(r"(?is)<img\s[^>]*>").unwrap()
155    });
156
157    RE_PICTURE.replace_all(html, |caps: &regex::Captures| {
158        let inner = &caps[1];
159
160        // Try to find the best source from <source srcset="...">
161        let mut best_url: Option<String> = None;
162        for source_cap in RE_SOURCE_SRCSET.captures_iter(inner) {
163            let srcset = &source_cap[1];
164            if let Some(url) = parse_srcset_pick_largest(srcset) {
165                best_url = Some(url);
166            }
167        }
168
169        // If we found a <source>, build an <img> with that URL
170        if let Some(url) = best_url {
171            // Try to preserve alt from the fallback <img>
172            if let Some(img_match) = RE_IMG_TAG.find(inner) {
173                let img_tag = img_match.as_str();
174                let alt_regex = Regex::new(r#"alt\s*=\s*["']([^"']*?)["']"#).unwrap();
175                let alt = alt_regex.captures(img_tag)
176                    .map(|c| c[1].to_string())
177                    .unwrap_or_default();
178                format!(r#"<img src="{}" alt="{}">"#, url, alt)
179            } else {
180                format!(r#"<img src="{}" alt="">"#, url)
181            }
182        } else if let Some(img_match) = RE_IMG_TAG.find(inner) {
183            // No <source> found, just use the fallback <img>
184            img_match.as_str().to_string()
185        } else {
186            String::new()
187        }
188    }).to_string()
189}
190
191/// Resolve lazy-loaded images by promoting data-src, data-lazy-src, etc. to src.
192pub fn resolve_lazy_images(html: &str) -> String {
193    static RE_LAZY_IMG: LazyLock<Regex> = LazyLock::new(|| {
194        Regex::new(r#"(?is)<img\s[^>]*data-(?:src|lazy-src|original|lazy-load)\s*=\s*["'][^"']+["'][^>]*>"#).unwrap()
195    });
196    static RE_DATA_SRC: LazyLock<Regex> = LazyLock::new(|| {
197        Regex::new(r#"data-(?:src|lazy-src|original|lazy-load)\s*=\s*["']([^"']+)["']"#).unwrap()
198    });
199    static RE_HAS_REAL_SRC: LazyLock<Regex> = LazyLock::new(|| {
200        Regex::new(r#"(?i)\bsrc\s*=\s*["']([^"']+)["']"#).unwrap()
201    });
202
203    RE_LAZY_IMG.replace_all(html, |caps: &regex::Captures| {
204        let tag = &caps[0];
205
206        // Extract the lazy data-src URL
207        if let Some(data_cap) = RE_DATA_SRC.captures(tag) {
208            let lazy_url = &data_cap[1];
209
210            // If there's already a real src (not data: URI), keep the tag as-is
211            if let Some(src_cap) = RE_HAS_REAL_SRC.captures(tag) {
212                if !src_cap[1].starts_with("data:") {
213                    return tag.to_string();
214                }
215            }
216
217            // Replace or add src with the lazy URL
218            let src_attr = Regex::new(r#"src\s*=\s*["'][^"']*["']"#).unwrap();
219            if src_attr.is_match(tag) {
220                src_attr.replace(tag, &format!(r#"src="{}""#, lazy_url)).to_string()
221            } else {
222                tag.replace("<img ", &format!(r#"<img src="{}" "#, lazy_url))
223            }
224        } else {
225            tag.to_string()
226        }
227    }).to_string()
228}
229
230/// Extract video poster frames as images so they appear in markdown output.
231pub fn resolve_video_posters(html: &str) -> String {
232    static RE_VIDEO_POSTER: LazyLock<Regex> = LazyLock::new(|| {
233        Regex::new(r#"(?is)<video[^>]*poster\s*=\s*["']([^"']+)["'][^>]*>.*?</video>"#).unwrap()
234    });
235
236    RE_VIDEO_POSTER.replace_all(html, |caps: &regex::Captures| {
237        let poster_url = &caps[1];
238        let original = &caps[0];
239        // Keep the original video tag and append a poster image
240        format!(r#"{}<img src="{}" alt="Video poster">"#, original, poster_url)
241    }).to_string()
242}
243
244#[cfg(test)]
245mod tests {
246    use super::*;
247
248    #[test]
249    fn test_parse_srcset_width_descriptors() {
250        let srcset = "small.jpg 300w, medium.jpg 600w, large.jpg 1200w";
251        let largest = parse_srcset_pick_largest(srcset).unwrap();
252        assert_eq!(largest, "large.jpg");
253    }
254
255    #[test]
256    fn test_parse_srcset_density_descriptors() {
257        let srcset = "small.jpg 1x, large.jpg 2x";
258        let largest = parse_srcset_pick_largest(srcset).unwrap();
259        assert_eq!(largest, "large.jpg");
260    }
261
262    #[test]
263    fn test_parse_srcset_mixed() {
264        let srcset = "small.jpg 300w, medium.jpg 2x, large.jpg 1600w";
265        let largest = parse_srcset_pick_largest(srcset).unwrap();
266        assert_eq!(largest, "large.jpg");
267    }
268
269    #[test]
270    fn test_parse_srcset_single() {
271        let srcset = "image.jpg 500w";
272        let largest = parse_srcset_pick_largest(srcset).unwrap();
273        assert_eq!(largest, "image.jpg");
274    }
275
276    #[test]
277    fn test_parse_srcset_no_descriptor() {
278        let srcset = "image.jpg";
279        let largest = parse_srcset_pick_largest(srcset).unwrap();
280        assert_eq!(largest, "image.jpg");
281    }
282
283    #[test]
284    fn test_resolve_srcsets() {
285        let html = r#"<img srcset="small.jpg 300w, large.jpg 1200w" src="small.jpg" alt="Test">"#;
286        let result = resolve_srcsets(html);
287        assert!(result.contains("src=\"large.jpg\""));
288        assert!(!result.contains("srcset="));
289    }
290
291    #[test]
292    fn test_resolve_srcsets_no_existing_src() {
293        let html = r#"<img srcset="small.jpg 300w, large.jpg 1200w" alt="Test">"#;
294        let result = resolve_srcsets(html);
295        assert!(result.contains("src=\"large.jpg\""));
296        assert!(!result.contains("srcset="));
297    }
298
299    #[test]
300    fn test_resolve_srcsets_multiple_images() {
301        let html = r#"
302            <img srcset="img1-small.jpg 300w, img1-large.jpg 1200w" alt="First">
303            <img srcset="img2-small.jpg 400w, img2-large.jpg 1600w" alt="Second">
304        "#;
305        let result = resolve_srcsets(html);
306        assert!(result.contains("src=\"img1-large.jpg\""));
307        assert!(result.contains("src=\"img2-large.jpg\""));
308        assert!(!result.contains("srcset="));
309    }
310
311    #[test]
312    fn test_resolve_srcsets_preserves_other_attributes() {
313        let html = r#"<img width="100" height="100" srcset="small.jpg 300w, large.jpg 1200w" alt="Logo" class="image">"#;
314        let result = resolve_srcsets(html);
315        assert!(result.contains("width=\"100\""));
316        assert!(result.contains("height=\"100\""));
317        assert!(result.contains("alt=\"Logo\""));
318        assert!(result.contains("class=\"image\""));
319        assert!(result.contains("src=\"large.jpg\""));
320        assert!(!result.contains("srcset="));
321    }
322
323    #[test]
324    fn test_resolve_srcsets_no_srcset_unchanged() {
325        let html = r#"<img src="regular.jpg" alt="Normal">"#;
326        let result = resolve_srcsets(html);
327        assert_eq!(result, html);
328    }
329
330    #[test]
331    fn test_resolve_srcsets_retina_display() {
332        let html = r#"<img srcset="image.jpg 1x, image@2x.jpg 2x, image@3x.jpg 3x" alt="Retina">"#;
333        let result = resolve_srcsets(html);
334        assert!(result.contains("src=\"image@3x.jpg\""));
335        assert!(!result.contains("srcset="));
336    }
337
338    #[test]
339    fn test_parse_width_descriptor_pixels() {
340        assert_eq!(parse_width_descriptor("800w"), Some(800));
341        assert_eq!(parse_width_descriptor("1200w"), Some(1200));
342    }
343
344    #[test]
345    fn test_parse_width_descriptor_retina() {
346        assert_eq!(parse_width_descriptor("1x"), Some(600));
347        assert_eq!(parse_width_descriptor("2x"), Some(1200));
348        assert_eq!(parse_width_descriptor("3x"), Some(1800));
349    }
350
351    #[test]
352    fn test_parse_width_descriptor_invalid() {
353        assert_eq!(parse_width_descriptor("invalid"), None);
354        assert_eq!(parse_width_descriptor(""), None);
355    }
356
357    #[test]
358    fn test_parse_srcset_empty() {
359        let srcset = "";
360        assert_eq!(parse_srcset_pick_largest(srcset), None);
361    }
362
363    #[test]
364    fn test_parse_srcset_whitespace_only() {
365        let srcset = "   ";
366        assert_eq!(parse_srcset_pick_largest(srcset), None);
367    }
368}