use crate::{error::Error, Result};
use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use rs_docx::document::Drawing;
use std::collections::HashMap;
use std::fs::{self, File};
use std::io::{Cursor, Read};
use std::path::{Path, PathBuf};
/// Reads images out of a DOCX package and reports each one as a string.
///
/// What happens to an image is decided by `mode`; where its bytes are read
/// from is decided by `source`. Both are fixed by the constructor used.
pub struct ImageExtractor {
// Output strategy selected by the constructor.
mode: ImageMode,
// Archive (if any) that image bytes are read from.
source: ImageSource,
// Incremented once for every image that was read; used to number files
// written in save-to-directory mode.
counter: usize,
}
/// Output strategy applied to every image the extractor processes.
enum ImageMode {
// Write each image as `image_<counter>.<ext>` inside this directory.
SaveToDir(PathBuf),
// Embed the image bytes as a base64 `data:` URI inside an `<img>` tag.
Inline,
// Do not extract anything; the public `extract_*` methods return `Ok(None)`.
Skip,
}
/// Where image bytes are read from. The archive is opened once by the
/// constructor and kept for the lifetime of the extractor.
enum ImageSource {
// ZIP archive backed by a file opened from a path.
Archive(zip::ZipArchive<File>),
// ZIP archive backed by an owned in-memory copy of the DOCX bytes.
ArchiveFromBytes(zip::ZipArchive<Cursor<Vec<u8>>>),
// No archive available (used by `new_skip`); reads yield an empty buffer.
None,
}
impl ImageExtractor {
pub fn new_with_dir<P: AsRef<Path>>(docx_path: P, output_dir: PathBuf) -> Result<Self> {
fs::create_dir_all(&output_dir)?;
let file = File::open(docx_path.as_ref())?;
let archive = zip::ZipArchive::new(file)
.map_err(|e| Error::DocxParse(format!("Failed to open DOCX as ZIP: {}", e)))?;
Ok(Self {
mode: ImageMode::SaveToDir(output_dir),
source: ImageSource::Archive(archive),
counter: 0,
})
}
pub fn new_with_dir_from_bytes(bytes: &[u8], output_dir: PathBuf) -> Result<Self> {
fs::create_dir_all(&output_dir)?;
let cursor = Cursor::new(bytes.to_vec());
let archive = zip::ZipArchive::new(cursor)
.map_err(|e| Error::DocxParse(format!("Failed to open DOCX as ZIP: {}", e)))?;
Ok(Self {
mode: ImageMode::SaveToDir(output_dir),
source: ImageSource::ArchiveFromBytes(archive),
counter: 0,
})
}
pub fn new_inline<P: AsRef<Path>>(docx_path: P) -> Result<Self> {
let file = File::open(docx_path.as_ref())?;
let archive = zip::ZipArchive::new(file)
.map_err(|e| Error::DocxParse(format!("Failed to open DOCX as ZIP: {}", e)))?;
Ok(Self {
mode: ImageMode::Inline,
source: ImageSource::Archive(archive),
counter: 0,
})
}
pub fn new_inline_from_bytes(bytes: &[u8]) -> Result<Self> {
let cursor = Cursor::new(bytes.to_vec());
let archive = zip::ZipArchive::new(cursor)
.map_err(|e| Error::DocxParse(format!("Failed to open DOCX as ZIP: {}", e)))?;
Ok(Self {
mode: ImageMode::Inline,
source: ImageSource::ArchiveFromBytes(archive),
counter: 0,
})
}
pub fn new_skip() -> Self {
Self {
mode: ImageMode::Skip,
source: ImageSource::None,
counter: 0,
}
}
pub fn extract_from_drawing(
&mut self,
drawing: &Drawing,
rels: &HashMap<String, String>,
) -> Result<Option<String>> {
if matches!(self.mode, ImageMode::Skip) {
return Ok(None);
}
let blip_id = self.find_blip_id(drawing);
let Some(rel_id) = blip_id else {
return Ok(None);
};
let Some(image_path) = rels.get(&rel_id) else {
return Ok(None);
};
self.process_image(image_path)
}
fn find_blip_id(&self, drawing: &Drawing) -> Option<String> {
if let Some(inline) = &drawing.inline {
if let Some(graphic) = &inline.graphic {
if let Some(pic) = graphic.data.children.first() {
let embed = &pic.fill.blip.embed;
if !embed.is_empty() {
return Some(embed.to_string());
}
}
}
}
if let Some(anchor) = &drawing.anchor {
if let Some(graphic) = &anchor.graphic {
if let Some(pic) = graphic.data.children.first() {
let embed = &pic.fill.blip.embed;
if !embed.is_empty() {
return Some(embed.to_string());
}
}
}
}
None
}
pub fn extract_from_pict(
&mut self,
pict: &rs_docx::document::Pict,
rels: &HashMap<String, String>,
) -> Result<Option<String>> {
if matches!(self.mode, ImageMode::Skip) {
return Ok(None);
}
let blip_id = self.find_pict_blip_id(pict);
let Some(rel_id) = blip_id else {
return Ok(None);
};
let Some(image_path) = rels.get(&rel_id) else {
return Ok(None);
};
self.process_image(image_path)
}
fn find_pict_blip_id(&self, pict: &rs_docx::document::Pict) -> Option<String> {
if let Some(shape) = &pict.shape {
if let Some(img_data) = &shape.image_data {
if let Some(id) = &img_data.id {
return Some(id.to_string());
}
}
}
if let Some(rect) = &pict.rect {
if let Some(img_data) = &rect.image_data {
if let Some(id) = &img_data.id {
return Some(id.to_string());
}
}
}
None
}
fn process_image(&mut self, image_path: &str) -> Result<Option<String>> {
let image_data = self.read_image_from_docx(image_path)?;
self.counter += 1;
let ext = Path::new(image_path)
.extension()
.and_then(|e| e.to_str())
.unwrap_or("png");
match &self.mode {
ImageMode::SaveToDir(dir) => {
let filename = format!("image_{}.{}", self.counter, ext);
let output_path = dir.join(&filename);
fs::write(&output_path, &image_data)?;
Ok(Some(format!("", output_path.display())))
}
ImageMode::Inline => {
let mime_type = match ext.to_lowercase().as_str() {
"png" => "image/png",
"jpg" | "jpeg" => "image/jpeg",
"gif" => "image/gif",
"webp" => "image/webp",
"svg" => "image/svg+xml",
_ => "application/octet-stream",
};
let b64 = BASE64.encode(&image_data);
Ok(Some(format!(
"<img src=\"data:{};base64,{}\" alt=\"image\" />",
mime_type, b64
)))
}
ImageMode::Skip => Ok(None),
}
}
fn read_image_from_docx(&mut self, image_path: &str) -> Result<Vec<u8>> {
let full_path = if image_path.starts_with("word/") {
image_path.to_string()
} else {
format!("word/{}", image_path)
};
let paths_to_try: [&str; 2] = [full_path.as_str(), image_path];
match &mut self.source {
ImageSource::Archive(archive) => {
extract_from_cached_zip(archive, &paths_to_try, image_path)
}
ImageSource::ArchiveFromBytes(archive) => {
extract_from_cached_zip(archive, &paths_to_try, image_path)
}
ImageSource::None => Ok(Vec::new()),
}
}
}
fn extract_from_cached_zip<R: Read + std::io::Seek>(
archive: &mut zip::ZipArchive<R>,
paths_to_try: &[&str],
original_path: &str,
) -> Result<Vec<u8>> {
for path in paths_to_try {
if let Ok(mut entry) = archive.by_name(path) {
let mut data = Vec::new();
entry.read_to_end(&mut data)?;
return Ok(data);
}
}
Err(Error::MediaNotFound(original_path.to_string()))
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Builds an in-memory ZIP with two stored entries under `word/media/`,
    /// standing in for a DOCX package that contains two images.
    fn make_docx_with_two_images() -> Vec<u8> {
        let mut zip = zip::ZipWriter::new(Cursor::new(Vec::<u8>::new()));
        let options: zip::write::FileOptions<zip::write::ExtendedFileOptions> =
            zip::write::FileOptions::default().compression_method(zip::CompressionMethod::Stored);
        zip.start_file("word/media/image1.png", options.clone())
            .unwrap();
        zip.write_all(b"PNG1").unwrap();
        zip.start_file("word/media/image2.png", options).unwrap();
        zip.write_all(b"PNG2").unwrap();
        // `into_inner` hands back the whole buffer regardless of the cursor
        // position, so no rewind is needed.
        zip.finish().unwrap().into_inner()
    }

    /// Two different images can be read one after another from the same
    /// extractor.
    #[test]
    fn test_multiple_images_from_same_archive() {
        let docx_bytes = make_docx_with_two_images();
        let mut extractor = ImageExtractor::new_inline_from_bytes(&docx_bytes)
            .expect("Failed to create extractor");
        let img1 = extractor
            .read_image_from_docx("media/image1.png")
            .expect("Failed to read image1");
        assert_eq!(img1, b"PNG1");
        let img2 = extractor
            .read_image_from_docx("media/image2.png")
            .expect("Failed to read image2");
        assert_eq!(img2, b"PNG2");
    }

    /// Both spellings of a target resolve against the same archive: relative
    /// to `word/` and already prefixed with `word/`.
    #[test]
    fn test_archive_opened_once_path_variant() {
        let docx_bytes = make_docx_with_two_images();
        let mut extractor = ImageExtractor::new_inline_from_bytes(&docx_bytes)
            .expect("Failed to create extractor");
        let bare = extractor
            .read_image_from_docx("media/image1.png")
            .expect("Failed to read image via bare path");
        let prefixed = extractor
            .read_image_from_docx("word/media/image2.png")
            .expect("Failed to read image via word/-prefixed path");
        assert_eq!(bare, b"PNG1");
        assert_eq!(prefixed, b"PNG2");
    }

    /// A target with no matching entry is reported as `MediaNotFound`.
    #[test]
    fn test_missing_image_returns_error() {
        let docx_bytes = make_docx_with_two_images();
        let mut extractor = ImageExtractor::new_inline_from_bytes(&docx_bytes)
            .expect("Failed to create extractor");
        let result = extractor.read_image_from_docx("media/nonexistent.png");
        assert!(matches!(result, Err(Error::MediaNotFound(_))));
    }
}