1use base64::Engine;
4use regex::Regex;
5use std::collections::hash_map::DefaultHasher;
6use std::fs;
7use std::hash::{Hash, Hasher};
8use std::path::{Path, PathBuf};
9use std::sync::OnceLock;
10use tracing::{debug, warn};
11
12fn base64_md_image_pattern() -> &'static Regex {
13 static PATTERN: OnceLock<Regex> = OnceLock::new();
14 PATTERN.get_or_init(|| {
15 Regex::new(
23 r#"!\[([^\]]*)\]\(data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([A-Za-z0-9+/=]+)(?:\s+"[^"]*")?\)"#,
24 )
25 .unwrap()
26 })
27}
28
/// Outcome of [`extract_and_save_images`].
#[derive(Clone, Debug)]
pub struct ExtractionResult {
    /// The markdown after the extraction pass.
    pub markdown: String,
    /// Number of inline base64 image references that were decoded and saved.
    pub extracted: usize,
}
37
38pub fn extract_and_save_images(
51 markdown: &str,
52 output_dir: &Path,
53 images_dir: &str,
54) -> crate::Result<ExtractionResult> {
55 let images_path = output_dir.join(images_dir);
56 let mut images: Vec<(String, Vec<u8>)> = Vec::new();
57
58 let updated_markdown =
59 base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
60 let alt_text = &caps[1];
61 let mime_ext = &caps[2];
62 let base64_data = &caps[3];
63
64 let ext = match mime_ext {
65 "jpeg" => "jpg",
66 "svg+xml" => "svg",
67 other => other,
68 };
69
70 base64::engine::general_purpose::STANDARD
71 .decode(base64_data)
72 .map_or_else(
73 |_| format!(""),
74 |data| {
75 let mut hasher = DefaultHasher::new();
76 data.hash(&mut hasher);
77 let hash = format!("{:016x}", hasher.finish());
78 let hash_prefix = &hash[..8];
79 let filename = format!("image-{hash_prefix}.{ext}");
80 let relative_path = format!("{images_dir}/{filename}");
81 debug!("Extracted image: {} ({} bytes)", filename, data.len());
82 images.push((filename, data));
83 format!("")
84 },
85 )
86 });
87
88 let extracted = images.len();
89
90 if !images.is_empty() {
91 fs::create_dir_all(&images_path)?;
92 for (filename, data) in &images {
93 fs::write(images_path.join(filename), data)?;
94 }
95 }
96
97 Ok(ExtractionResult {
98 markdown: updated_markdown.into_owned(),
99 extracted,
100 })
101}
102
103pub fn extract_base64_to_buffers(
106 markdown: &str,
107 images_dir: &str,
108) -> crate::Result<ExtractedBuffers> {
109 let mut images: Vec<ImageBuffer> = Vec::new();
110
111 let updated_markdown =
112 base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
113 let alt_text = &caps[1];
114 let mime_ext = &caps[2];
115 let base64_data = &caps[3];
116
117 let ext = match mime_ext {
118 "jpeg" => "jpg",
119 "svg+xml" => "svg",
120 other => other,
121 };
122
123 base64::engine::general_purpose::STANDARD
124 .decode(base64_data)
125 .map_or_else(
126 |_| format!(""),
127 |data| {
128 let mut hasher = DefaultHasher::new();
129 data.hash(&mut hasher);
130 let hash = format!("{:016x}", hasher.finish());
131 let hash_prefix = &hash[..8];
132 let filename = format!("image-{hash_prefix}.{ext}");
133 let relative_path = format!("{images_dir}/{filename}");
134 images.push(ImageBuffer { filename, data });
135 format!("")
136 },
137 )
138 });
139
140 Ok(ExtractedBuffers {
141 markdown: updated_markdown.into_owned(),
142 images,
143 })
144}
145
146#[derive(Debug, Clone)]
148pub struct ExtractedBuffers {
149 pub markdown: String,
150 pub images: Vec<ImageBuffer>,
151}
152
/// A decoded image held in memory together with the file name chosen for it.
#[derive(Clone, Debug)]
pub struct ImageBuffer {
    /// File name only, without any directory component.
    pub filename: String,
    /// The decoded image bytes.
    pub data: Vec<u8>,
}
159
160#[must_use]
168pub fn strip_base64_images(markdown: &str) -> StrippedResult {
169 let mut stripped = 0;
170 let updated = base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
171 stripped += 1;
172 let alt_text = &caps[1];
173 if alt_text.is_empty() {
174 "![]()".to_string()
175 } else {
176 format!("*[image: {alt_text}]*")
177 }
178 });
179 StrippedResult {
180 markdown: updated.into_owned(),
181 stripped,
182 }
183}
184
/// Outcome of [`strip_base64_images`].
#[derive(Clone, Debug)]
pub struct StrippedResult {
    /// The markdown with inline base64 images replaced by placeholders.
    pub markdown: String,
    /// Number of inline base64 images that were replaced.
    pub stripped: usize,
}
191
192#[must_use]
194pub fn has_base64_images(markdown: &str) -> bool {
195 base64_md_image_pattern().is_match(markdown)
196}
197
198fn remote_md_image_pattern() -> &'static Regex {
202 static PATTERN: OnceLock<Regex> = OnceLock::new();
203 PATTERN.get_or_init(|| {
204 Regex::new(r#"!\[([^\]]*)\]\((https?://[^)\s]+)(?:\s+"[^"]*")?\)"#).unwrap()
205 })
206}
207
/// Picks a file extension for a remote image based on its URL.
///
/// Anything from the first `?` or `#` onwards is ignored. The text after the
/// last `.` of what remains (or all of it when there is no `.`) is compared
/// case-insensitively against the known image extensions; `jpeg` is
/// normalised to `jpg`, and anything unrecognised falls back to `png`.
fn remote_image_extension(url: &str) -> &'static str {
    /// Recognised (lower-case) suffixes and the extension each one maps to.
    const KNOWN: [(&str, &str); 5] = [
        ("jpg", "jpg"),
        ("jpeg", "jpg"),
        ("gif", "gif"),
        ("webp", "webp"),
        ("svg", "svg"),
    ];

    // Cut off the query string and fragment, whichever starts first.
    let without_query = match url.find(['?', '#']) {
        Some(cut) => &url[..cut],
        None => url,
    };
    let suffix = match without_query.rfind('.') {
        Some(dot) => &without_query[dot + 1..],
        None => without_query,
    };
    let suffix = suffix.to_lowercase();

    KNOWN
        .iter()
        .find(|(candidate, _)| *candidate == suffix.as_str())
        .map_or("png", |(_, extension)| *extension)
}
219
/// Selects how [`apply_image_mode`] treats images in markdown.
#[derive(Clone, Debug)]
pub enum ImageMode {
    /// Strip inline base64 images from the markdown.
    Default,
    /// Leave the markdown untouched, keeping base64 images inline.
    Embed,
    /// Save inline base64 images to disk and rewrite image references to
    /// local paths.
    Extract {
        /// Base output directory; images are written to `dir.join(subdir)`.
        dir: PathBuf,
        /// Images directory name, also used as the prefix of rewritten links.
        subdir: String,
    },
}
240
/// A remote image whose markdown reference was rewritten to a local file name.
///
/// NOTE(review): nothing in this module fetches the image; presumably the
/// caller downloads `url` and stores it under `filename` — confirm at call sites.
#[derive(Clone, Debug)]
pub struct PendingRemoteImage {
    /// The original `http(s)` URL found in the markdown.
    pub url: String,
    /// The local file name assigned to the image (no directory component).
    pub filename: String,
}
250
251#[derive(Debug, Clone)]
253pub struct ApplyResult {
254 pub markdown: String,
256 pub extracted: usize,
258 pub stripped: usize,
260 pub pending_remote: Vec<PendingRemoteImage>,
262}
263
264pub fn apply_image_mode(
273 markdown: &str,
274 mode: ImageMode,
275 base_url: Option<&str>,
276) -> crate::Result<ApplyResult> {
277 let _ = base_url; match mode {
279 ImageMode::Embed => Ok(ApplyResult {
280 markdown: markdown.to_string(),
281 extracted: 0,
282 stripped: 0,
283 pending_remote: Vec::new(),
284 }),
285 ImageMode::Default => {
286 let result = strip_base64_images(markdown);
287 if result.stripped > 0 {
288 warn!(
289 "Stripped {} inline base64 image(s) for default markdown; \
290 use --embed-images to keep them inline or --extract-images to save files",
291 result.stripped
292 );
293 }
294 Ok(ApplyResult {
295 markdown: result.markdown,
296 extracted: 0,
297 stripped: result.stripped,
298 pending_remote: Vec::new(),
299 })
300 }
301 ImageMode::Extract { dir, subdir } => {
302 let extracted = extract_and_save_images(markdown, &dir, &subdir)?;
304 let mut pending_remote = Vec::new();
306 let mut index = 0usize;
307 let localized = remote_md_image_pattern()
308 .replace_all(&extracted.markdown, |caps: ®ex::Captures<'_>| {
309 let alt_text = &caps[1];
310 let url = caps[2].to_string();
311 index += 1;
312 let filename = format!("image-{index:02}.{}", remote_image_extension(&url));
313 let relative_path = format!("{subdir}/{filename}");
314 pending_remote.push(PendingRemoteImage { url, filename });
315 format!("")
316 })
317 .into_owned();
318 Ok(ApplyResult {
319 markdown: localized,
320 extracted: extracted.extracted,
321 stripped: 0,
322 pending_remote,
323 })
324 }
325 }
326}