use anyhow::{Context, Result, anyhow};
use std::fs::File;
use std::io::{Cursor, Read, Write};
use std::path::Path;
use xmltree::{Element, XMLNode};
use zip::{ZipArchive, ZipWriter, write::SimpleFileOptions};
use crate::models::{FileKind, SanitizationPlan};
use crate::sanitizers::Sanitizer;
/// Archive entry names that this module treats as document-property
/// (metadata) parts.
///
/// `is_metadata_file` checks an entry name against this list; entries that
/// match are parsed as XML and have the children of their root element
/// removed instead of being copied through unchanged.
const OOXML_METADATA_FILES: &[&str] = &[
"docProps/app.xml",
"docProps/core.xml",
"docProps/custom.xml",
];
/// Stateless [`Sanitizer`] implementation for Office Open XML packages.
///
/// It reports its kind as `FileKind::OfficeOpenXml` and works on the zip
/// archive directly, emptying the metadata parts listed in
/// `OOXML_METADATA_FILES`.
pub struct OoXmlSanitizer;
impl Sanitizer for OoXmlSanitizer {
fn kind(&self) -> FileKind {
FileKind::OfficeOpenXml
}
fn plan(&self, input: &Path) -> Result<SanitizationPlan> {
let file = File::open(input)
.with_context(|| format!("failed to open OOXML archive {}", input.display()))?;
let mut archive = ZipArchive::new(file)
.with_context(|| format!("failed to read OOXML archive {}", input.display()))?;
let mut removed_items = Vec::new();
for index in 0..archive.len() {
let mut entry = archive.by_index(index)?;
let name = entry.name().to_string();
if is_metadata_file(&name) {
let mut bytes = Vec::new();
entry.read_to_end(&mut bytes)?;
let element =
parse_xml(&bytes).with_context(|| format!("failed to parse {name}"))?;
let removed_children = metadata_child_names(&element);
if !removed_children.is_empty() {
removed_items.push(format!(
"{name}: cleared {} child nodes",
removed_children.len()
));
}
}
}
Ok(SanitizationPlan { removed_items })
}
fn sanitize(&self, input: &Path, output: &mut File) -> Result<SanitizationPlan> {
let file = File::open(input)
.with_context(|| format!("failed to open OOXML archive {}", input.display()))?;
let mut archive = ZipArchive::new(file)
.with_context(|| format!("failed to read OOXML archive {}", input.display()))?;
let mut writer = ZipWriter::new(&mut *output).set_auto_large_file();
let options = SimpleFileOptions::default();
let mut removed_items = Vec::new();
for index in 0..archive.len() {
let mut entry = archive.by_index(index)?;
let name = entry.name().to_string();
if name.ends_with('/') {
writer.add_directory_from_path(Path::new(&name), options)?;
continue;
}
if is_metadata_file(&name) {
let mut bytes = Vec::new();
entry.read_to_end(&mut bytes)?;
let mut element =
parse_xml(&bytes).with_context(|| format!("failed to parse {name}"))?;
let removed_children = clear_metadata_children(&mut element);
let mut buffer = Vec::new();
element
.write(&mut buffer)
.with_context(|| format!("failed to write sanitized XML for {name}"))?;
writer.start_file_from_path(Path::new(&name), options)?;
writer.write_all(&buffer)?;
if !removed_children.is_empty() {
removed_items.push(format!(
"{name}: cleared {} child nodes",
removed_children.len()
));
}
} else {
writer.raw_copy_file(entry)?;
}
}
let output = writer.finish()?;
output.flush()?;
Ok(SanitizationPlan { removed_items })
}
}
/// Returns `true` when `name` is one of the entries in
/// `OOXML_METADATA_FILES`.
///
/// The comparison ignores ASCII case: Open Packaging Conventions part names
/// are equivalent when they differ only in ASCII case, so an entry stored as
/// `DocProps/Core.xml` must be sanitized just like `docProps/core.xml`
/// rather than being copied through untouched.
fn is_metadata_file(name: &str) -> bool {
    OOXML_METADATA_FILES
        .iter()
        .any(|candidate| candidate.eq_ignore_ascii_case(name))
}
/// Parses `bytes` as an XML document and returns its root element.
///
/// Parser failures are converted into an `anyhow` error so callers can
/// attach their own context.
fn parse_xml(bytes: &[u8]) -> Result<Element> {
    let reader = Cursor::new(bytes);
    match Element::parse(reader) {
        Ok(root) => Ok(root),
        Err(parse_error) => Err(anyhow!(parse_error)),
    }
}
/// Collects the tag names of the direct element children of `element`, in
/// document order.
///
/// Non-element nodes are skipped.
fn metadata_child_names(element: &Element) -> Vec<String> {
    let mut names = Vec::new();
    for node in &element.children {
        if let XMLNode::Element(child) = node {
            names.push(child.name.clone());
        }
    }
    names
}
fn clear_metadata_children(element: &mut Element) -> Vec<String> {
let removed = metadata_child_names(element);
element.children.clear();
removed
}