web_capture/
extract_images.rs1use base64::Engine;
4use regex::Regex;
5use std::collections::hash_map::DefaultHasher;
6use std::fs;
7use std::hash::{Hash, Hasher};
8use std::path::Path;
9use std::sync::OnceLock;
10use tracing::debug;
11
12fn base64_md_image_pattern() -> &'static Regex {
13 static PATTERN: OnceLock<Regex> = OnceLock::new();
14 PATTERN.get_or_init(|| {
15 Regex::new(r"!\[([^\]]*)\]\(data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^)]+)\)")
16 .unwrap()
17 })
18}
19
/// Outcome of [`extract_and_save_images`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractionResult {
    /// Markdown text produced by the extraction pass.
    pub markdown: String,
    /// Number of inline images that were successfully decoded.
    pub extracted: usize,
}
28
29pub fn extract_and_save_images(
42 markdown: &str,
43 output_dir: &Path,
44 images_dir: &str,
45) -> crate::Result<ExtractionResult> {
46 let images_path = output_dir.join(images_dir);
47 let mut images: Vec<(String, Vec<u8>)> = Vec::new();
48
49 let updated_markdown =
50 base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
51 let alt_text = &caps[1];
52 let mime_ext = &caps[2];
53 let base64_data = &caps[3];
54
55 let ext = match mime_ext {
56 "jpeg" => "jpg",
57 "svg+xml" => "svg",
58 other => other,
59 };
60
61 base64::engine::general_purpose::STANDARD
62 .decode(base64_data)
63 .map_or_else(
64 |_| format!(""),
65 |data| {
66 let mut hasher = DefaultHasher::new();
67 data.hash(&mut hasher);
68 let hash = format!("{:016x}", hasher.finish());
69 let hash_prefix = &hash[..8];
70 let filename = format!("image-{hash_prefix}.{ext}");
71 let relative_path = format!("{images_dir}/{filename}");
72 debug!("Extracted image: {} ({} bytes)", filename, data.len());
73 images.push((filename, data));
74 format!("")
75 },
76 )
77 });
78
79 let extracted = images.len();
80
81 if !images.is_empty() {
82 fs::create_dir_all(&images_path)?;
83 for (filename, data) in &images {
84 fs::write(images_path.join(filename), data)?;
85 }
86 }
87
88 Ok(ExtractionResult {
89 markdown: updated_markdown.into_owned(),
90 extracted,
91 })
92}
93
94pub fn extract_base64_to_buffers(
97 markdown: &str,
98 images_dir: &str,
99) -> crate::Result<ExtractedBuffers> {
100 let mut images: Vec<ImageBuffer> = Vec::new();
101
102 let updated_markdown =
103 base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
104 let alt_text = &caps[1];
105 let mime_ext = &caps[2];
106 let base64_data = &caps[3];
107
108 let ext = match mime_ext {
109 "jpeg" => "jpg",
110 "svg+xml" => "svg",
111 other => other,
112 };
113
114 base64::engine::general_purpose::STANDARD
115 .decode(base64_data)
116 .map_or_else(
117 |_| format!(""),
118 |data| {
119 let mut hasher = DefaultHasher::new();
120 data.hash(&mut hasher);
121 let hash = format!("{:016x}", hasher.finish());
122 let hash_prefix = &hash[..8];
123 let filename = format!("image-{hash_prefix}.{ext}");
124 let relative_path = format!("{images_dir}/{filename}");
125 images.push(ImageBuffer { filename, data });
126 format!("")
127 },
128 )
129 });
130
131 Ok(ExtractedBuffers {
132 markdown: updated_markdown.into_owned(),
133 images,
134 })
135}
136
/// Outcome of [`extract_base64_to_buffers`]: the rewritten Markdown plus the
/// decoded images held in memory.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedBuffers {
    /// Markdown text produced by the extraction pass.
    pub markdown: String,
    /// Decoded images, in the order they were matched in the input.
    pub images: Vec<ImageBuffer>,
}

/// A single decoded image together with the file name generated for it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImageBuffer {
    /// Generated file name (no directory component).
    pub filename: String,
    /// Raw decoded image bytes.
    pub data: Vec<u8>,
}
150
151#[must_use]
153pub fn strip_base64_images(markdown: &str) -> StrippedResult {
154 let mut stripped = 0;
155 let updated = base64_md_image_pattern().replace_all(markdown, |caps: ®ex::Captures<'_>| {
156 stripped += 1;
157 let alt_text = &caps[1];
158 if alt_text.is_empty() {
159 String::new()
160 } else {
161 format!("*[image: {alt_text}]*")
162 }
163 });
164 StrippedResult {
165 markdown: updated.into_owned(),
166 stripped,
167 }
168}
169
/// Outcome of [`strip_base64_images`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StrippedResult {
    /// Markdown text with the inline base64 images removed or replaced.
    pub markdown: String,
    /// Number of inline base64 images that were matched and stripped.
    pub stripped: usize,
}
176
177#[must_use]
179pub fn has_base64_images(markdown: &str) -> bool {
180 base64_md_image_pattern().is_match(markdown)
181}