use base64::Engine;
use regex::Regex;
use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::{Hash, Hasher};
use std::path::Path;
use std::sync::OnceLock;
use tracing::debug;
/// Returns the lazily compiled regex that matches Markdown images whose target
/// is an inline `data:image/...;base64,...` URI.
///
/// Capture groups:
/// 1. the alt text between `![` and `]` (may be empty),
/// 2. the image subtype following `data:image/` — one of `png`, `jpeg`, `jpg`,
///    `gif`, `webp` or `svg+xml`,
/// 3. the payload after `;base64,` up to (not including) the closing `)`.
///
/// The regex is compiled once on first use and shared for the rest of the
/// process via a `OnceLock`.
///
/// # Panics
///
/// Panics only if the hard-coded pattern fails to compile, which would be a
/// programming error rather than a runtime condition.
fn base64_md_image_pattern() -> &'static Regex {
    const IMAGE_PATTERN: &str =
        r"!\[([^\]]*)\]\(data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^)]+)\)";
    static PATTERN: OnceLock<Regex> = OnceLock::new();

    PATTERN.get_or_init(|| {
        Regex::new(IMAGE_PATTERN).expect("hard-coded base64 image pattern must be a valid regex")
    })
}
/// Value returned by `extract_and_save_images`.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
/// Markdown text after the replacement pass over inline base64 images.
pub markdown: String,
/// Number of images that were successfully decoded and collected for writing.
pub extracted: usize,
}
pub fn extract_and_save_images(
markdown: &str,
output_dir: &Path,
images_dir: &str,
) -> crate::Result<ExtractionResult> {
let images_path = output_dir.join(images_dir);
let mut images: Vec<(String, Vec<u8>)> = Vec::new();
let updated_markdown =
base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
let alt_text = &caps[1];
let mime_ext = &caps[2];
let base64_data = &caps[3];
let ext = match mime_ext {
"jpeg" => "jpg",
"svg+xml" => "svg",
other => other,
};
base64::engine::general_purpose::STANDARD
.decode(base64_data)
.map_or_else(
|_| format!(""),
|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
let hash = format!("{:016x}", hasher.finish());
let hash_prefix = &hash[..8];
let filename = format!("image-{hash_prefix}.{ext}");
let relative_path = format!("{images_dir}/{filename}");
debug!("Extracted image: {} ({} bytes)", filename, data.len());
images.push((filename, data));
format!("")
},
)
});
let extracted = images.len();
if !images.is_empty() {
fs::create_dir_all(&images_path)?;
for (filename, data) in &images {
fs::write(images_path.join(filename), data)?;
}
}
Ok(ExtractionResult {
markdown: updated_markdown.into_owned(),
extracted,
})
}
pub fn extract_base64_to_buffers(
markdown: &str,
images_dir: &str,
) -> crate::Result<ExtractedBuffers> {
let mut images: Vec<ImageBuffer> = Vec::new();
let updated_markdown =
base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
let alt_text = &caps[1];
let mime_ext = &caps[2];
let base64_data = &caps[3];
let ext = match mime_ext {
"jpeg" => "jpg",
"svg+xml" => "svg",
other => other,
};
base64::engine::general_purpose::STANDARD
.decode(base64_data)
.map_or_else(
|_| format!(""),
|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
let hash = format!("{:016x}", hasher.finish());
let hash_prefix = &hash[..8];
let filename = format!("image-{hash_prefix}.{ext}");
let relative_path = format!("{images_dir}/{filename}");
images.push(ImageBuffer { filename, data });
format!("")
},
)
});
Ok(ExtractedBuffers {
markdown: updated_markdown.into_owned(),
images,
})
}
/// Value returned by `extract_base64_to_buffers`.
#[derive(Debug, Clone)]
pub struct ExtractedBuffers {
/// Markdown text after the replacement pass over inline base64 images.
pub markdown: String,
/// One entry per successfully decoded image, in the order the images were matched.
pub images: Vec<ImageBuffer>,
}
/// A decoded image held in memory together with the file name generated for it.
#[derive(Debug, Clone)]
pub struct ImageBuffer {
/// Generated name of the form `image-<8 hex digits>.<ext>`.
pub filename: String,
/// Raw bytes obtained by base64-decoding the image payload.
pub data: Vec<u8>,
}
#[must_use]
pub fn strip_base64_images(markdown: &str) -> StrippedResult {
let mut stripped = 0;
let updated = base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
stripped += 1;
let alt_text = &caps[1];
if alt_text.is_empty() {
String::new()
} else {
format!("*[image: {alt_text}]*")
}
});
StrippedResult {
markdown: updated.into_owned(),
stripped,
}
}
/// Value returned by `strip_base64_images`.
#[derive(Debug, Clone)]
pub struct StrippedResult {
/// Markdown text with every matched inline base64 image replaced.
pub markdown: String,
/// Number of inline base64 images that were matched and replaced.
pub stripped: usize,
}
/// Reports whether `markdown` contains at least one Markdown image whose target
/// is an inline `data:image/...;base64,...` URI in one of the recognised image
/// subtypes. The payload itself is not validated.
#[must_use]
pub fn has_base64_images(markdown: &str) -> bool {
    let image_pattern = base64_md_image_pattern();
    image_pattern.is_match(markdown)
}