use base64::Engine;
use regex::Regex;
use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use tracing::{debug, warn};
/// Returns the process-wide regex that matches Markdown images whose target is an inline
/// `data:image/...;base64,` URI.
///
/// Capture groups:
/// 1. the alt text (anything up to the closing `]`),
/// 2. the image subtype (`png`, `jpeg`, `jpg`, `gif`, `webp` or `svg+xml`),
/// 3. the base64 payload (`A-Z`, `a-z`, `0-9`, `+`, `/`, `=`).
///
/// An optional double-quoted title after the payload is matched but not captured.
/// The regex is compiled on first use and cached for the lifetime of the process.
fn base64_md_image_pattern() -> &'static Regex {
    const SOURCE: &str = r#"!\[([^\]]*)\]\(data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([A-Za-z0-9+/=]+)(?:\s+"[^"]*")?\)"#;
    static COMPILED: OnceLock<Regex> = OnceLock::new();

    // The pattern is a fixed literal, so compilation failing would be a programming error.
    COMPILED.get_or_init(|| Regex::new(SOURCE).unwrap())
}
/// Outcome of [`extract_and_save_images`].
#[derive(Debug, Clone)]
pub struct ExtractionResult {
/// Markdown text after every inline base64 image match has been rewritten.
pub markdown: String,
/// Number of inline images whose payload decoded successfully and was written to disk.
pub extracted: usize,
}
pub fn extract_and_save_images(
markdown: &str,
output_dir: &Path,
images_dir: &str,
) -> crate::Result<ExtractionResult> {
let images_path = output_dir.join(images_dir);
let mut images: Vec<(String, Vec<u8>)> = Vec::new();
let updated_markdown =
base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
let alt_text = &caps[1];
let mime_ext = &caps[2];
let base64_data = &caps[3];
let ext = match mime_ext {
"jpeg" => "jpg",
"svg+xml" => "svg",
other => other,
};
base64::engine::general_purpose::STANDARD
.decode(base64_data)
.map_or_else(
|_| format!(""),
|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
let hash = format!("{:016x}", hasher.finish());
let hash_prefix = &hash[..8];
let filename = format!("image-{hash_prefix}.{ext}");
let relative_path = format!("{images_dir}/{filename}");
debug!("Extracted image: {} ({} bytes)", filename, data.len());
images.push((filename, data));
format!("")
},
)
});
let extracted = images.len();
if !images.is_empty() {
fs::create_dir_all(&images_path)?;
for (filename, data) in &images {
fs::write(images_path.join(filename), data)?;
}
}
Ok(ExtractionResult {
markdown: updated_markdown.into_owned(),
extracted,
})
}
pub fn extract_base64_to_buffers(
markdown: &str,
images_dir: &str,
) -> crate::Result<ExtractedBuffers> {
let mut images: Vec<ImageBuffer> = Vec::new();
let updated_markdown =
base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
let alt_text = &caps[1];
let mime_ext = &caps[2];
let base64_data = &caps[3];
let ext = match mime_ext {
"jpeg" => "jpg",
"svg+xml" => "svg",
other => other,
};
base64::engine::general_purpose::STANDARD
.decode(base64_data)
.map_or_else(
|_| format!(""),
|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
let hash = format!("{:016x}", hasher.finish());
let hash_prefix = &hash[..8];
let filename = format!("image-{hash_prefix}.{ext}");
let relative_path = format!("{images_dir}/{filename}");
images.push(ImageBuffer { filename, data });
format!("")
},
)
});
Ok(ExtractedBuffers {
markdown: updated_markdown.into_owned(),
images,
})
}
/// Outcome of [`extract_base64_to_buffers`].
#[derive(Debug, Clone)]
pub struct ExtractedBuffers {
/// Markdown text after every inline base64 image match has been rewritten.
pub markdown: String,
/// One entry per inline image whose payload decoded successfully, in order of appearance.
pub images: Vec<ImageBuffer>,
}
/// A decoded image held in memory together with the filename generated for it.
#[derive(Debug, Clone)]
pub struct ImageBuffer {
/// Generated file name of the form `image-<hash>.<ext>` (no directory component).
pub filename: String,
/// Raw image bytes obtained by base64-decoding the inline payload.
pub data: Vec<u8>,
}
#[must_use]
pub fn strip_base64_images(markdown: &str) -> StrippedResult {
let mut stripped = 0;
let updated = base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
stripped += 1;
let alt_text = &caps[1];
if alt_text.is_empty() {
"![]()".to_string()
} else {
format!("*[image: {alt_text}]*")
}
});
StrippedResult {
markdown: updated.into_owned(),
stripped,
}
}
/// Outcome of [`strip_base64_images`].
#[derive(Debug, Clone)]
pub struct StrippedResult {
/// Markdown text with inline base64 images replaced by placeholders.
pub markdown: String,
/// Number of inline base64 images that were replaced.
pub stripped: usize,
}
/// Reports whether `markdown` contains at least one inline base64 image, i.e. a Markdown image
/// whose target is a `data:image/...;base64,` URI recognised by the shared pattern.
#[must_use]
pub fn has_base64_images(markdown: &str) -> bool {
    let pattern = base64_md_image_pattern();
    pattern.is_match(markdown)
}
/// Returns the process-wide regex that matches Markdown images pointing at an `http://` or
/// `https://` URL.
///
/// Capture group 1 is the alt text and group 2 is the URL (everything up to the first `)` or
/// whitespace). An optional double-quoted title after the URL is matched but not captured.
/// The regex is compiled on first use and cached for the lifetime of the process.
fn remote_md_image_pattern() -> &'static Regex {
    const SOURCE: &str = r#"!\[([^\]]*)\]\((https?://[^)\s]+)(?:\s+"[^"]*")?\)"#;
    static COMPILED: OnceLock<Regex> = OnceLock::new();

    // The pattern is a fixed literal, so compilation failing would be a programming error.
    COMPILED.get_or_init(|| Regex::new(SOURCE).unwrap())
}
/// Picks the file extension to use when saving the image behind `url`.
///
/// The query string and fragment (everything from the first `?` or `#`) are ignored. The text
/// after the last `.` of what remains is compared case-insensitively against the known image
/// extensions; `jpeg` is normalised to `jpg`. Anything unrecognised, including URLs without a
/// dot, falls back to `png`.
fn remote_image_extension(url: &str) -> &'static str {
    // (recognised suffix, extension to emit)
    const KNOWN: [(&str, &str); 5] = [
        ("jpg", "jpg"),
        ("jpeg", "jpg"),
        ("gif", "gif"),
        ("webp", "webp"),
        ("svg", "svg"),
    ];

    // Cut off the query/fragment so a `.` inside them cannot be mistaken for the extension.
    let end = url.find(['?', '#']).unwrap_or(url.len());
    let path = &url[..end];

    // Without any dot the whole path is treated as the candidate suffix.
    let suffix = path.rfind('.').map_or(path, |dot| &path[dot + 1..]);
    let lowered = suffix.to_lowercase();

    KNOWN
        .iter()
        .find(|(candidate, _)| *candidate == lowered)
        .map_or("png", |(_, ext)| *ext)
}
/// Selects how [`apply_image_mode`] treats images found in the Markdown.
#[derive(Debug, Clone)]
pub enum ImageMode {
/// Replace inline base64 images with placeholders (see [`strip_base64_images`]).
Default,
/// Leave the Markdown untouched, keeping inline base64 images embedded.
Embed,
/// Write inline base64 images to files under `dir/subdir` and rewrite remote image
/// references to file names under `subdir`.
Extract { dir: PathBuf, subdir: String },
}
/// A remote image that was referenced in the Markdown and assigned a local file name.
#[derive(Debug, Clone)]
pub struct PendingRemoteImage {
/// The `http://` or `https://` URL as it appeared in the Markdown.
pub url: String,
/// Local file name assigned to the image, of the form `image-NN.<ext>`.
pub filename: String,
}
/// Outcome of [`apply_image_mode`].
#[derive(Debug, Clone)]
pub struct ApplyResult {
/// Markdown text after the selected image mode has been applied.
pub markdown: String,
/// Number of inline base64 images written to disk (non-zero only in extract mode).
pub extracted: usize,
/// Number of inline base64 images replaced by placeholders (non-zero only in default mode).
pub stripped: usize,
/// Remote images that were given local file names in extract mode; empty otherwise.
pub pending_remote: Vec<PendingRemoteImage>,
}
pub fn apply_image_mode(
markdown: &str,
mode: ImageMode,
base_url: Option<&str>,
) -> crate::Result<ApplyResult> {
let _ = base_url; match mode {
ImageMode::Embed => Ok(ApplyResult {
markdown: markdown.to_string(),
extracted: 0,
stripped: 0,
pending_remote: Vec::new(),
}),
ImageMode::Default => {
let result = strip_base64_images(markdown);
if result.stripped > 0 {
warn!(
"Stripped {} inline base64 image(s) for default markdown; \
use --embed-images to keep them inline or --extract-images to save files",
result.stripped
);
}
Ok(ApplyResult {
markdown: result.markdown,
extracted: 0,
stripped: result.stripped,
pending_remote: Vec::new(),
})
}
ImageMode::Extract { dir, subdir } => {
let extracted = extract_and_save_images(markdown, &dir, &subdir)?;
let mut pending_remote = Vec::new();
let mut index = 0usize;
let localized = remote_md_image_pattern()
.replace_all(&extracted.markdown, |caps: ®ex::Captures<'_>| {
let alt_text = &caps[1];
let url = caps[2].to_string();
index += 1;
let filename = format!("image-{index:02}.{}", remote_image_extension(&url));
let relative_path = format!("{subdir}/{filename}");
pending_remote.push(PendingRemoteImage { url, filename });
format!("")
})
.into_owned();
Ok(ApplyResult {
markdown: localized,
extracted: extracted.extracted,
stripped: 0,
pending_remote,
})
}
}
}