web_capture/
localize_images.rs1use regex::Regex;
13use serde::{Deserialize, Serialize};
14use url::Url;
15
/// One remote image reference found in a markdown document.
///
/// Produced by [`extract_image_references`], one value per regex match.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageReference {
    /// The complete matched markdown text (capture group 0).
    pub full_match: String,
    /// The text between the square brackets (capture group 1); may be empty.
    pub alt_text: String,
    /// The `http://` or `https://` URL between the parentheses (capture group 2).
    pub url: String,
}
23
/// Serializable record describing where one image came from and where it is stored locally.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageMetadata {
    /// 1-based position of the image among the external images that were processed.
    pub index: usize,
    /// URL the image was referenced by in the source markdown.
    pub original_url: String,
    /// Alt text copied from the markdown reference.
    pub alt_text: String,
    /// Path of the local copy, built as `<images_dir>/<generated filename>`.
    pub local_path: String,
}
32
/// A single textual substitution to apply to the markdown, plus the image payload behind it.
#[derive(Debug, Clone)]
pub struct ImageReplacement {
    /// Exact markdown text to search for (the original image reference).
    pub from: String,
    /// Text that replaces `from` in the markdown.
    pub to: String,
    /// Downloaded image bytes; `None` when the replacement was produced by a dry run.
    pub buffer: Option<Vec<u8>>,
    /// Generated local file name for the image (without the images directory).
    pub filename: String,
}
41
/// Outcome of a [`localize_images`] run.
#[derive(Debug, Clone)]
pub struct LocalizeResult {
    /// The markdown after all recorded replacements have been applied.
    pub markdown: String,
    /// Number of images whose download succeeded (stays `0` in a dry run).
    pub downloaded: usize,
    /// Number of external images that were considered for localization.
    pub total: usize,
    /// One entry per image that was (or, in a dry run, would be) rewritten.
    pub replacements: Vec<ImageReplacement>,
    /// Metadata entries parallel to `replacements`.
    pub metadata: Vec<ImageMetadata>,
}
51
/// Settings that control how [`localize_images`] treats image references.
#[derive(Debug, Clone)]
pub struct LocalizeOptions {
    /// Directory name used as the prefix of every local image path. URLs that already contain
    /// `<images_dir>/` are left untouched.
    pub images_dir: String,
    /// When `true`, nothing is downloaded; replacements are still computed, with no buffers.
    pub dry_run: bool,
    /// Substrings to skip: any image URL containing one of them is not localized.
    pub exclude_domains: Vec<String>,
}

impl Default for LocalizeOptions {
    /// Defaults: store into `images`, perform real downloads, exclude nothing.
    fn default() -> Self {
        let images_dir = String::from("images");
        let exclude_domains: Vec<String> = Vec::new();

        Self {
            images_dir,
            dry_run: false,
            exclude_domains,
        }
    }
}
69
70#[must_use]
72pub fn extract_image_references(markdown_text: &str) -> Vec<ImageReference> {
73 let re = Regex::new(r"!\[([^\]]*)\]\((https?://[^)]+)\)").unwrap();
74 let mut images = Vec::new();
75
76 for cap in re.captures_iter(markdown_text) {
77 images.push(ImageReference {
78 full_match: cap[0].to_string(),
79 alt_text: cap[1].to_string(),
80 url: cap[2].to_string(),
81 });
82 }
83
84 images
85}
86
87#[must_use]
89pub fn get_extension_from_url(url_str: &str) -> String {
90 if let Ok(parsed) = Url::parse(url_str) {
91 let path = parsed.path().split('?').next().unwrap_or("");
92 if let Some(ext_match) = Regex::new(r"\.(\w+)$")
93 .ok()
94 .and_then(|re| re.captures(path))
95 {
96 let lower = ext_match[1].to_lowercase();
97 if ["png", "jpg", "jpeg", "gif", "webp", "svg"].contains(&lower.as_str()) {
98 return format!(".{lower}");
99 }
100 }
101 }
102 ".png".to_string()
103}
104
105#[must_use]
107pub fn generate_local_filename(url: &str, index: usize) -> String {
108 let ext = get_extension_from_url(url);
109 format!("image-{:02}{ext}", index + 1)
110}
111
112pub async fn localize_images(markdown_text: &str, options: &LocalizeOptions) -> LocalizeResult {
118 let all_images = extract_image_references(markdown_text);
119
120 let external_images: Vec<&ImageReference> = all_images
122 .iter()
123 .filter(|img| {
124 if !img.url.starts_with("http") {
125 return false;
126 }
127 if img.url.contains(&format!("{}/", options.images_dir)) {
128 return false;
129 }
130 for domain in &options.exclude_domains {
131 if img.url.contains(domain) {
132 return false;
133 }
134 }
135 true
136 })
137 .collect();
138
139 if external_images.is_empty() {
140 return LocalizeResult {
141 markdown: markdown_text.to_string(),
142 downloaded: 0,
143 total: 0,
144 replacements: Vec::new(),
145 metadata: Vec::new(),
146 };
147 }
148
149 let mut replacements = Vec::new();
150 let mut metadata = Vec::new();
151 let mut downloaded_count = 0;
152 let mut updated_markdown = markdown_text.to_string();
153
154 for (i, image) in external_images.iter().enumerate() {
155 let local_filename = generate_local_filename(&image.url, i);
156 let relative_path = format!("{}/{local_filename}", options.images_dir);
157
158 if options.dry_run {
159 replacements.push(ImageReplacement {
160 from: image.full_match.clone(),
161 to: format!("", image.alt_text),
162 buffer: None,
163 filename: local_filename,
164 });
165 metadata.push(ImageMetadata {
166 index: i + 1,
167 original_url: image.url.clone(),
168 alt_text: image.alt_text.clone(),
169 local_path: relative_path,
170 });
171 continue;
172 }
173
174 if let Ok(buffer) = download_image(&image.url).await {
176 downloaded_count += 1;
177 replacements.push(ImageReplacement {
178 from: image.full_match.clone(),
179 to: format!("", image.alt_text),
180 buffer: Some(buffer),
181 filename: local_filename,
182 });
183 metadata.push(ImageMetadata {
184 index: i + 1,
185 original_url: image.url.clone(),
186 alt_text: image.alt_text.clone(),
187 local_path: relative_path,
188 });
189 }
190 }
192
193 for replacement in &replacements {
195 updated_markdown = updated_markdown.replace(&replacement.from, &replacement.to);
196 }
197
198 LocalizeResult {
199 markdown: updated_markdown,
200 downloaded: downloaded_count,
201 total: external_images.len(),
202 replacements,
203 metadata,
204 }
205}
206
207async fn download_image(url: &str) -> Result<Vec<u8>, String> {
209 let client = reqwest::Client::builder()
210 .build()
211 .map_err(|e| e.to_string())?;
212
213 for attempt in 0..3 {
214 match client.get(url).send().await {
215 Ok(resp) => {
216 if resp.status().is_success() {
217 match resp.bytes().await {
218 Ok(bytes) => return Ok(bytes.to_vec()),
219 Err(e) => {
220 if attempt == 2 {
221 return Err(e.to_string());
222 }
223 }
224 }
225 } else if attempt == 2 {
226 return Err(format!("HTTP {}", resp.status()));
227 }
228 }
229 Err(e) => {
230 if attempt == 2 {
231 return Err(e.to_string());
232 }
233 }
234 }
235 tokio::time::sleep(std::time::Duration::from_secs(1)).await;
236 }
237 Err("Max retries exceeded".to_string())
238}