Skip to main content

web_capture/
extract_images.rs

1//! Extract base64 data URI images from markdown and save as files.
2
3use base64::Engine;
4use regex::Regex;
5use std::collections::hash_map::DefaultHasher;
6use std::fs;
7use std::hash::{Hash, Hasher};
8use std::path::{Path, PathBuf};
9use std::sync::OnceLock;
10use tracing::{debug, warn};
11
12fn base64_md_image_pattern() -> &'static Regex {
13    static PATTERN: OnceLock<Regex> = OnceLock::new();
14    PATTERN.get_or_init(|| {
15        // Capture groups:
16        //   1: alt text
17        //   2: image subtype (png|jpeg|...)
18        //   3: base64 payload — strictly alphabet/digits/+, /, =
19        // An optional trailing ` "title"` block is matched but discarded, so
20        // markdown like `![](data:...;base64,XYZ== "")` decodes cleanly
21        // instead of letting the empty title leak into the base64 payload.
22        Regex::new(
23            r#"!\[([^\]]*)\]\(data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([A-Za-z0-9+/=]+)(?:\s+"[^"]*")?\)"#,
24        )
25        .unwrap()
26    })
27}
28
/// Result of extracting images from markdown.
///
/// Returned by [`extract_and_save_images`].
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    /// Updated markdown with local image paths: every data URI image that
    /// decoded successfully is rewritten to `<images_dir>/<filename>`.
    pub markdown: String,
    /// Number of images extracted. Counted per rewritten reference, so the
    /// same image appearing twice in the document is counted twice.
    pub extracted: usize,
}
37
38/// Extract base64 data URI images from markdown, save them as files,
39/// and rewrite references to relative paths.
40///
41/// # Arguments
42///
43/// * `markdown` - Markdown content with data URI images
44/// * `output_dir` - Directory where the markdown file is being written
45/// * `images_dir` - Subdirectory name for images (default: "images")
46///
47/// # Errors
48///
49/// Returns an error if file I/O fails.
50pub fn extract_and_save_images(
51    markdown: &str,
52    output_dir: &Path,
53    images_dir: &str,
54) -> crate::Result<ExtractionResult> {
55    let images_path = output_dir.join(images_dir);
56    let mut images: Vec<(String, Vec<u8>)> = Vec::new();
57
58    let updated_markdown =
59        base64_md_image_pattern().replace_all(markdown, |caps: &regex::Captures<'_>| {
60            let alt_text = &caps[1];
61            let mime_ext = &caps[2];
62            let base64_data = &caps[3];
63
64            let ext = match mime_ext {
65                "jpeg" => "jpg",
66                "svg+xml" => "svg",
67                other => other,
68            };
69
70            base64::engine::general_purpose::STANDARD
71                .decode(base64_data)
72                .map_or_else(
73                    |_| format!("![{alt_text}](data:image/{mime_ext};base64,{base64_data})"),
74                    |data| {
75                        let mut hasher = DefaultHasher::new();
76                        data.hash(&mut hasher);
77                        let hash = format!("{:016x}", hasher.finish());
78                        let hash_prefix = &hash[..8];
79                        let filename = format!("image-{hash_prefix}.{ext}");
80                        let relative_path = format!("{images_dir}/{filename}");
81                        debug!("Extracted image: {} ({} bytes)", filename, data.len());
82                        images.push((filename, data));
83                        format!("![{alt_text}]({relative_path})")
84                    },
85                )
86        });
87
88    let extracted = images.len();
89
90    if !images.is_empty() {
91        fs::create_dir_all(&images_path)?;
92        for (filename, data) in &images {
93            fs::write(images_path.join(filename), data)?;
94        }
95    }
96
97    Ok(ExtractionResult {
98        markdown: updated_markdown.into_owned(),
99        extracted,
100    })
101}
102
103/// Extract base64 images from markdown into memory buffers without writing to disk.
104/// Intended for streaming into archives.
105pub fn extract_base64_to_buffers(
106    markdown: &str,
107    images_dir: &str,
108) -> crate::Result<ExtractedBuffers> {
109    let mut images: Vec<ImageBuffer> = Vec::new();
110
111    let updated_markdown =
112        base64_md_image_pattern().replace_all(markdown, |caps: &regex::Captures<'_>| {
113            let alt_text = &caps[1];
114            let mime_ext = &caps[2];
115            let base64_data = &caps[3];
116
117            let ext = match mime_ext {
118                "jpeg" => "jpg",
119                "svg+xml" => "svg",
120                other => other,
121            };
122
123            base64::engine::general_purpose::STANDARD
124                .decode(base64_data)
125                .map_or_else(
126                    |_| format!("![{alt_text}](data:image/{mime_ext};base64,{base64_data})"),
127                    |data| {
128                        let mut hasher = DefaultHasher::new();
129                        data.hash(&mut hasher);
130                        let hash = format!("{:016x}", hasher.finish());
131                        let hash_prefix = &hash[..8];
132                        let filename = format!("image-{hash_prefix}.{ext}");
133                        let relative_path = format!("{images_dir}/{filename}");
134                        images.push(ImageBuffer { filename, data });
135                        format!("![{alt_text}]({relative_path})")
136                    },
137                )
138        });
139
140    Ok(ExtractedBuffers {
141        markdown: updated_markdown.into_owned(),
142        images,
143    })
144}
145
/// Result of extracting base64 images to memory buffers.
///
/// Returned by [`extract_base64_to_buffers`].
#[derive(Debug, Clone)]
pub struct ExtractedBuffers {
    /// Markdown in which each successfully decoded data URI image reference
    /// has been rewritten to a relative `<images_dir>/<filename>` path.
    pub markdown: String,
    /// Decoded images, one entry per rewritten reference.
    pub images: Vec<ImageBuffer>,
}
152
/// An extracted image as an in-memory buffer.
#[derive(Debug, Clone)]
pub struct ImageBuffer {
    /// File name without a directory component. As produced by
    /// [`extract_base64_to_buffers`] it has the form `image-<8 hex digits>.<ext>`.
    pub filename: String,
    /// Raw image bytes (the base64 payload after decoding).
    pub data: Vec<u8>,
}
159
160/// Strip base64 data URI images from markdown, leaving a visible placeholder.
161///
162/// Non-empty alt becomes `*[image: <alt>]*`. Empty alt — common for Google
163/// Docs HTML exports, which emit `<img alt="" src="data:...">` for every
164/// image — becomes `![]()`, an empty markdown image reference that renderers
165/// still surface as a slot. Emitting `""` for empty-alt would silently delete
166/// every image in the document (see issue #117).
167#[must_use]
168pub fn strip_base64_images(markdown: &str) -> StrippedResult {
169    let mut stripped = 0;
170    let updated = base64_md_image_pattern().replace_all(markdown, |caps: &regex::Captures<'_>| {
171        stripped += 1;
172        let alt_text = &caps[1];
173        if alt_text.is_empty() {
174            "![]()".to_string()
175        } else {
176            format!("*[image: {alt_text}]*")
177        }
178    });
179    StrippedResult {
180        markdown: updated.into_owned(),
181        stripped,
182    }
183}
184
/// Result of stripping base64 images.
///
/// Returned by [`strip_base64_images`].
#[derive(Debug, Clone)]
pub struct StrippedResult {
    /// Markdown with every matched base64 image replaced by a placeholder.
    pub markdown: String,
    /// Number of base64 image references that were replaced.
    pub stripped: usize,
}
191
192/// Check if markdown contains any base64 data URI images.
193#[must_use]
194pub fn has_base64_images(markdown: &str) -> bool {
195    base64_md_image_pattern().is_match(markdown)
196}
197
198/// Matches a markdown image whose source is a remote `http(s)` URL. A trailing
199/// markdown title attribute (e.g. `![](url "caption")`) is matched but excluded
200/// from the captured URL.
201fn remote_md_image_pattern() -> &'static Regex {
202    static PATTERN: OnceLock<Regex> = OnceLock::new();
203    PATTERN.get_or_init(|| {
204        Regex::new(r#"!\[([^\]]*)\]\((https?://[^)\s]+)(?:\s+"[^"]*")?\)"#).unwrap()
205    })
206}
207
/// Pick the local file extension for a remote image URL.
///
/// The query string and fragment are ignored, and the text after the last `.`
/// of what remains is compared case-insensitively against the known image
/// extensions. `jpeg` is normalized to `jpg`; anything unrecognized (including
/// a URL without any `.`) falls back to `png`.
fn remote_image_extension(url: &str) -> &'static str {
    // Keep only the part before the first `?` or `#`.
    let without_suffix = match url.find(['?', '#']) {
        Some(cut) => &url[..cut],
        None => url,
    };

    // Text after the final dot, or the whole remainder when there is no dot.
    let candidate = match without_suffix.rfind('.') {
        Some(dot) => &without_suffix[dot + 1..],
        None => without_suffix,
    };

    let is = |name: &str| candidate.eq_ignore_ascii_case(name);

    if is("jpg") || is("jpeg") {
        "jpg"
    } else if is("gif") {
        "gif"
    } else if is("webp") {
        "webp"
    } else if is("svg") {
        "svg"
    } else {
        "png"
    }
}
219
/// How the unified image pipeline should treat images in captured markdown.
///
/// This is the single chokepoint every CLI/server capture path routes through,
/// so the same flag produces the same result regardless of capture method
/// (browser vs API, JS vs Rust). See issue #112.
#[derive(Debug, Clone)]
pub enum ImageMode {
    /// Default `--format markdown` contract: keep remote URLs as **direct
    /// links**, and strip inline base64 data URIs (which have no remote URL to
    /// restore) down to a visible placeholder. No `images/` folder is written
    /// and no multi-megabyte base64 blob is silently kept inline.
    Default,
    /// `--embed-images`: keep base64 data URIs inline so the output is a single
    /// self-contained file.
    Embed,
    /// `--extract-images`: extract base64 images to files under `dir/subdir`,
    /// and rewrite remote image references to the same local `subdir/` paths
    /// (the remote bytes are downloaded by the caller — see `pending_remote`).
    ///
    /// `dir` is the output directory handed to [`extract_and_save_images`];
    /// `subdir` is the images folder name joined onto it, and is also the
    /// prefix used in every rewritten image path.
    Extract { dir: PathBuf, subdir: String },
}
240
/// A remote image whose reference was rewritten to a local path by
/// [`ImageMode::Extract`] but whose bytes still need to be downloaded.
#[derive(Debug, Clone)]
pub struct PendingRemoteImage {
    /// Original remote URL.
    pub url: String,
    /// Local filename (relative to the images subdirectory). As assigned by
    /// [`apply_image_mode`] it has the form `image-NN.<ext>`, where `NN` is a
    /// 1-based counter zero-padded to at least two digits.
    pub filename: String,
}
250
/// Result of applying an [`ImageMode`] to markdown.
///
/// As populated by [`apply_image_mode`], the fields that do not apply to the
/// chosen mode are left at `0` / empty.
#[derive(Debug, Clone)]
pub struct ApplyResult {
    /// Rewritten markdown.
    pub markdown: String,
    /// Number of base64 images extracted to disk. Only non-zero for
    /// [`ImageMode::Extract`].
    pub extracted: usize,
    /// Number of base64 images stripped to placeholders. Only non-zero for
    /// [`ImageMode::Default`].
    pub stripped: usize,
    /// Remote images whose references were localized and still need downloading.
    /// Only populated for [`ImageMode::Extract`].
    pub pending_remote: Vec<PendingRemoteImage>,
}
263
264/// Apply an [`ImageMode`] to markdown — the single image-handling chokepoint.
265///
266/// `base_url` is reserved for resolving relative image URLs and is currently
267/// unused; callers pass the source document URL (or `None`).
268///
269/// # Errors
270///
271/// Returns an error if file I/O fails while extracting images to disk.
272pub fn apply_image_mode(
273    markdown: &str,
274    mode: ImageMode,
275    base_url: Option<&str>,
276) -> crate::Result<ApplyResult> {
277    let _ = base_url; // reserved for future relative-URL resolution
278    match mode {
279        ImageMode::Embed => Ok(ApplyResult {
280            markdown: markdown.to_string(),
281            extracted: 0,
282            stripped: 0,
283            pending_remote: Vec::new(),
284        }),
285        ImageMode::Default => {
286            let result = strip_base64_images(markdown);
287            if result.stripped > 0 {
288                warn!(
289                    "Stripped {} inline base64 image(s) for default markdown; \
290                     use --embed-images to keep them inline or --extract-images to save files",
291                    result.stripped
292                );
293            }
294            Ok(ApplyResult {
295                markdown: result.markdown,
296                extracted: 0,
297                stripped: result.stripped,
298                pending_remote: Vec::new(),
299            })
300        }
301        ImageMode::Extract { dir, subdir } => {
302            // 1. Extract inline base64 images to files.
303            let extracted = extract_and_save_images(markdown, &dir, &subdir)?;
304            // 2. Plan localization of remote image references to the same folder.
305            let mut pending_remote = Vec::new();
306            let mut index = 0usize;
307            let localized = remote_md_image_pattern()
308                .replace_all(&extracted.markdown, |caps: &regex::Captures<'_>| {
309                    let alt_text = &caps[1];
310                    let url = caps[2].to_string();
311                    index += 1;
312                    let filename = format!("image-{index:02}.{}", remote_image_extension(&url));
313                    let relative_path = format!("{subdir}/{filename}");
314                    pending_remote.push(PendingRemoteImage { url, filename });
315                    format!("![{alt_text}]({relative_path})")
316                })
317                .into_owned();
318            Ok(ApplyResult {
319                markdown: localized,
320                extracted: extracted.extracted,
321                stripped: 0,
322                pending_remote,
323            })
324        }
325    }
326}