Skip to main content

web_capture/
extract_images.rs

//! Extract base64 data URI images from Markdown and save them as files.
2
3use base64::Engine;
4use regex::Regex;
5use std::collections::hash_map::DefaultHasher;
6use std::fs;
7use std::hash::{Hash, Hasher};
8use std::path::Path;
9use std::sync::OnceLock;
10use tracing::debug;
11
12fn base64_md_image_pattern() -> &'static Regex {
13    static PATTERN: OnceLock<Regex> = OnceLock::new();
14    PATTERN.get_or_init(|| {
15        Regex::new(r"!\[([^\]]*)\]\(data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^)]+)\)")
16            .unwrap()
17    })
18}
19
/// Result of extracting images from markdown and saving them to disk.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct ExtractionResult {
    /// Markdown in which every extracted data URI image now points at a relative file path.
    pub markdown: String,
    /// Number of image references that were extracted (each occurrence is counted).
    pub extracted: usize,
}
28
29/// Extract base64 data URI images from markdown, save them as files,
30/// and rewrite references to relative paths.
31///
32/// # Arguments
33///
34/// * `markdown` - Markdown content with data URI images
35/// * `output_dir` - Directory where the markdown file is being written
36/// * `images_dir` - Subdirectory name for images (default: "images")
37///
38/// # Errors
39///
40/// Returns an error if file I/O fails.
41pub fn extract_and_save_images(
42    markdown: &str,
43    output_dir: &Path,
44    images_dir: &str,
45) -> crate::Result<ExtractionResult> {
46    let images_path = output_dir.join(images_dir);
47    let mut images: Vec<(String, Vec<u8>)> = Vec::new();
48
49    let updated_markdown =
50        base64_md_image_pattern().replace_all(markdown, |caps: &regex::Captures<'_>| {
51            let alt_text = &caps[1];
52            let mime_ext = &caps[2];
53            let base64_data = &caps[3];
54
55            let ext = match mime_ext {
56                "jpeg" => "jpg",
57                "svg+xml" => "svg",
58                other => other,
59            };
60
61            base64::engine::general_purpose::STANDARD
62                .decode(base64_data)
63                .map_or_else(
64                    |_| format!("![{alt_text}](data:image/{mime_ext};base64,{base64_data})"),
65                    |data| {
66                        let mut hasher = DefaultHasher::new();
67                        data.hash(&mut hasher);
68                        let hash = format!("{:016x}", hasher.finish());
69                        let hash_prefix = &hash[..8];
70                        let filename = format!("image-{hash_prefix}.{ext}");
71                        let relative_path = format!("{images_dir}/{filename}");
72                        debug!("Extracted image: {} ({} bytes)", filename, data.len());
73                        images.push((filename, data));
74                        format!("![{alt_text}]({relative_path})")
75                    },
76                )
77        });
78
79    let extracted = images.len();
80
81    if !images.is_empty() {
82        fs::create_dir_all(&images_path)?;
83        for (filename, data) in &images {
84            fs::write(images_path.join(filename), data)?;
85        }
86    }
87
88    Ok(ExtractionResult {
89        markdown: updated_markdown.into_owned(),
90        extracted,
91    })
92}
93
94/// Extract base64 images from markdown into memory buffers without writing to disk.
95/// Intended for streaming into archives.
96pub fn extract_base64_to_buffers(
97    markdown: &str,
98    images_dir: &str,
99) -> crate::Result<ExtractedBuffers> {
100    let mut images: Vec<ImageBuffer> = Vec::new();
101
102    let updated_markdown =
103        base64_md_image_pattern().replace_all(markdown, |caps: &regex::Captures<'_>| {
104            let alt_text = &caps[1];
105            let mime_ext = &caps[2];
106            let base64_data = &caps[3];
107
108            let ext = match mime_ext {
109                "jpeg" => "jpg",
110                "svg+xml" => "svg",
111                other => other,
112            };
113
114            base64::engine::general_purpose::STANDARD
115                .decode(base64_data)
116                .map_or_else(
117                    |_| format!("![{alt_text}](data:image/{mime_ext};base64,{base64_data})"),
118                    |data| {
119                        let mut hasher = DefaultHasher::new();
120                        data.hash(&mut hasher);
121                        let hash = format!("{:016x}", hasher.finish());
122                        let hash_prefix = &hash[..8];
123                        let filename = format!("image-{hash_prefix}.{ext}");
124                        let relative_path = format!("{images_dir}/{filename}");
125                        images.push(ImageBuffer { filename, data });
126                        format!("![{alt_text}]({relative_path})")
127                    },
128                )
129        });
130
131    Ok(ExtractedBuffers {
132        markdown: updated_markdown.into_owned(),
133        images,
134    })
135}
136
/// Result of extracting base64 images to memory buffers.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct ExtractedBuffers {
    /// Markdown in which every decoded data URI image now points at a relative file path.
    pub markdown: String,
    /// Decoded images in the order their references were rewritten in the markdown.
    pub images: Vec<ImageBuffer>,
}

/// An extracted image as an in-memory buffer.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct ImageBuffer {
    /// File name (without directory) that the rewritten markdown refers to.
    pub filename: String,
    /// Decoded image bytes.
    pub data: Vec<u8>,
}
150
151/// Strip base64 data URI images from markdown, leaving alt text placeholders.
152#[must_use]
153pub fn strip_base64_images(markdown: &str) -> StrippedResult {
154    let mut stripped = 0;
155    let updated = base64_md_image_pattern().replace_all(markdown, |caps: &regex::Captures<'_>| {
156        stripped += 1;
157        let alt_text = &caps[1];
158        if alt_text.is_empty() {
159            String::new()
160        } else {
161            format!("*[image: {alt_text}]*")
162        }
163    });
164    StrippedResult {
165        markdown: updated.into_owned(),
166        stripped,
167    }
168}
169
/// Result of stripping base64 images.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct StrippedResult {
    /// Markdown with every base64 data URI image replaced by a placeholder or removed.
    pub markdown: String,
    /// Number of images that were stripped.
    pub stripped: usize,
}
176
177/// Check if markdown contains any base64 data URI images.
178#[must_use]
179pub fn has_base64_images(markdown: &str) -> bool {
180    base64_md_image_pattern().is_match(markdown)
181}