Skip to main content

web_capture/
localize_images.rs

1//! Markdown image localization module (R5).
2//!
3//! Post-processing tool that:
4//! 1. Reads markdown text
5//! 2. Extracts all external image URLs
6//! 3. Downloads images to local directory
7//! 4. Updates markdown to reference local paths
8//!
9//! Based on reference implementation from:
10//! <https://github.com/link-foundation/meta-theory/blob/main/scripts/download-markdown-images.mjs>
11
12use regex::Regex;
13use serde::{Deserialize, Serialize};
14use url::Url;
15
16/// An image reference extracted from markdown.
17#[derive(Debug, Clone, Serialize, Deserialize)]
18pub struct ImageReference {
19    pub full_match: String,
20    pub alt_text: String,
21    pub url: String,
22}
23
24/// Metadata about a localized image.
25#[derive(Debug, Clone, Serialize, Deserialize)]
26pub struct ImageMetadata {
27    pub index: usize,
28    pub original_url: String,
29    pub alt_text: String,
30    pub local_path: String,
31}
32
/// A single textual substitution to perform on the markdown, plus the image
/// payload that belongs to it.
#[derive(Debug, Clone)]
pub struct ImageReplacement {
    /// Markdown snippet to look for (the original image reference).
    pub from: String,
    /// Markdown snippet to put in its place (the localized image reference).
    pub to: String,
    /// Downloaded image bytes; `None` when nothing was downloaded (dry run).
    pub buffer: Option<Vec<u8>>,
    /// File name (without directory) the image should be stored under.
    pub filename: String,
}
41
42/// Result of localizing images.
43#[derive(Debug, Clone)]
44pub struct LocalizeResult {
45    pub markdown: String,
46    pub downloaded: usize,
47    pub total: usize,
48    pub replacements: Vec<ImageReplacement>,
49    pub metadata: Vec<ImageMetadata>,
50}
51
/// Settings that control how `localize_images` behaves.
#[derive(Debug, Clone)]
pub struct LocalizeOptions {
    /// Directory prefix used when building local image paths; URLs that
    /// already contain `<images_dir>/` are left untouched.
    pub images_dir: String,
    /// When `true`, nothing is downloaded and replacements carry no bytes.
    pub dry_run: bool,
    /// Images whose URL contains any of these strings are skipped.
    pub exclude_domains: Vec<String>,
}
59
60impl Default for LocalizeOptions {
61    fn default() -> Self {
62        Self {
63            images_dir: "images".to_string(),
64            dry_run: false,
65            exclude_domains: Vec::new(),
66        }
67    }
68}
69
70/// Extract image references from markdown text.
71#[must_use]
72pub fn extract_image_references(markdown_text: &str) -> Vec<ImageReference> {
73    let re = Regex::new(r"!\[([^\]]*)\]\((https?://[^)]+)\)").unwrap();
74    let mut images = Vec::new();
75
76    for cap in re.captures_iter(markdown_text) {
77        images.push(ImageReference {
78            full_match: cap[0].to_string(),
79            alt_text: cap[1].to_string(),
80            url: cap[2].to_string(),
81        });
82    }
83
84    images
85}
86
87/// Get file extension from URL.
88#[must_use]
89pub fn get_extension_from_url(url_str: &str) -> String {
90    if let Ok(parsed) = Url::parse(url_str) {
91        let path = parsed.path().split('?').next().unwrap_or("");
92        if let Some(ext_match) = Regex::new(r"\.(\w+)$")
93            .ok()
94            .and_then(|re| re.captures(path))
95        {
96            let lower = ext_match[1].to_lowercase();
97            if ["png", "jpg", "jpeg", "gif", "webp", "svg"].contains(&lower.as_str()) {
98                return format!(".{lower}");
99            }
100        }
101    }
102    ".png".to_string()
103}
104
105/// Generate local filename for a downloaded image.
106#[must_use]
107pub fn generate_local_filename(url: &str, index: usize) -> String {
108    let ext = get_extension_from_url(url);
109    format!("image-{:02}{ext}", index + 1)
110}
111
112/// Localize images in markdown text by downloading external images
113/// and replacing URLs with local paths.
114///
115/// Note: In the Rust implementation, actual downloading requires `reqwest`.
116/// The `dry_run` mode works without network access.
117pub async fn localize_images(markdown_text: &str, options: &LocalizeOptions) -> LocalizeResult {
118    let all_images = extract_image_references(markdown_text);
119
120    // Filter to only external images not already localized
121    let external_images: Vec<&ImageReference> = all_images
122        .iter()
123        .filter(|img| {
124            if !img.url.starts_with("http") {
125                return false;
126            }
127            if img.url.contains(&format!("{}/", options.images_dir)) {
128                return false;
129            }
130            for domain in &options.exclude_domains {
131                if img.url.contains(domain) {
132                    return false;
133                }
134            }
135            true
136        })
137        .collect();
138
139    if external_images.is_empty() {
140        return LocalizeResult {
141            markdown: markdown_text.to_string(),
142            downloaded: 0,
143            total: 0,
144            replacements: Vec::new(),
145            metadata: Vec::new(),
146        };
147    }
148
149    let mut replacements = Vec::new();
150    let mut metadata = Vec::new();
151    let mut downloaded_count = 0;
152    let mut updated_markdown = markdown_text.to_string();
153
154    for (i, image) in external_images.iter().enumerate() {
155        let local_filename = generate_local_filename(&image.url, i);
156        let relative_path = format!("{}/{local_filename}", options.images_dir);
157
158        if options.dry_run {
159            replacements.push(ImageReplacement {
160                from: image.full_match.clone(),
161                to: format!("![{}]({relative_path})", image.alt_text),
162                buffer: None,
163                filename: local_filename,
164            });
165            metadata.push(ImageMetadata {
166                index: i + 1,
167                original_url: image.url.clone(),
168                alt_text: image.alt_text.clone(),
169                local_path: relative_path,
170            });
171            continue;
172        }
173
174        // Download the image
175        if let Ok(buffer) = download_image(&image.url).await {
176            downloaded_count += 1;
177            replacements.push(ImageReplacement {
178                from: image.full_match.clone(),
179                to: format!("![{}]({relative_path})", image.alt_text),
180                buffer: Some(buffer),
181                filename: local_filename,
182            });
183            metadata.push(ImageMetadata {
184                index: i + 1,
185                original_url: image.url.clone(),
186                alt_text: image.alt_text.clone(),
187                local_path: relative_path,
188            });
189        }
190        // Keep original URL if download fails
191    }
192
193    // Apply replacements to markdown
194    for replacement in &replacements {
195        updated_markdown = updated_markdown.replace(&replacement.from, &replacement.to);
196    }
197
198    LocalizeResult {
199        markdown: updated_markdown,
200        downloaded: downloaded_count,
201        total: external_images.len(),
202        replacements,
203        metadata,
204    }
205}
206
207/// Download an image from a URL with retry.
208async fn download_image(url: &str) -> Result<Vec<u8>, String> {
209    let client = reqwest::Client::builder()
210        .build()
211        .map_err(|e| e.to_string())?;
212
213    for attempt in 0..3 {
214        match client.get(url).send().await {
215            Ok(resp) => {
216                if resp.status().is_success() {
217                    match resp.bytes().await {
218                        Ok(bytes) => return Ok(bytes.to_vec()),
219                        Err(e) => {
220                            if attempt == 2 {
221                                return Err(e.to_string());
222                            }
223                        }
224                    }
225                } else if attempt == 2 {
226                    return Err(format!("HTTP {}", resp.status()));
227                }
228            }
229            Err(e) => {
230                if attempt == 2 {
231                    return Err(e.to_string());
232                }
233            }
234        }
235        tokio::time::sleep(std::time::Duration::from_secs(1)).await;
236    }
237    Err("Max retries exceeded".to_string())
238}