eclipse-sanitizer 0.1.1

A fast Rust CLI for sanitizing metadata from documents and images
use anyhow::{Context, Result, anyhow};
use std::fs::File;
use std::io::{Cursor, Read, Write};
use std::path::Path;
use xmltree::{Element, XMLNode};
use zip::{ZipArchive, ZipWriter, write::SimpleFileOptions};

use crate::models::{FileKind, SanitizationPlan};
use crate::sanitizers::Sanitizer;

/// ZIP entry names of the document-property parts that this module treats as
/// metadata. `is_metadata_file` checks archive entry names against this list,
/// and the sanitizer empties the root element of every entry that matches.
const OOXML_METADATA_FILES: &[&str] = &[
    "docProps/app.xml",
    "docProps/core.xml",
    "docProps/custom.xml",
];

/// Sanitizer for Office Open XML archives (it reports `FileKind::OfficeOpenXml`).
///
/// The type carries no state; all behaviour lives in its `Sanitizer`
/// implementation, which rewrites the parts listed in `OOXML_METADATA_FILES`
/// with their root element emptied and carries every other archive entry over.
pub struct OoXmlSanitizer;

impl Sanitizer for OoXmlSanitizer {
    /// Identifies this sanitizer as the handler for Office Open XML files.
    fn kind(&self) -> FileKind {
        FileKind::OfficeOpenXml
    }

    /// Inspects the archive at `input` without modifying anything and reports
    /// what `sanitize` would clear.
    ///
    /// For every metadata part whose root element has at least one child
    /// element, one line of the form `"<entry>: cleared <n> child nodes"` is
    /// added to the returned plan.
    ///
    /// # Errors
    ///
    /// Fails if the file cannot be opened, is not a readable ZIP archive, an
    /// entry cannot be read, or a metadata part is not parseable XML.
    fn plan(&self, input: &Path) -> Result<SanitizationPlan> {
        let source = File::open(input)
            .with_context(|| format!("failed to open OOXML archive {}", input.display()))?;
        let mut archive = ZipArchive::new(source)
            .with_context(|| format!("failed to read OOXML archive {}", input.display()))?;
        let mut removed_items = Vec::new();

        for position in 0..archive.len() {
            let mut entry = archive.by_index(position)?;
            let name = entry.name().to_string();

            // Only metadata parts are of interest during planning.
            if !is_metadata_file(&name) {
                continue;
            }

            let mut raw = Vec::new();
            entry.read_to_end(&mut raw)?;
            let root = parse_xml(&raw).with_context(|| format!("failed to parse {name}"))?;

            let child_count = metadata_child_names(&root).len();
            if child_count != 0 {
                removed_items.push(format!(
                    "{name}: cleared {} child nodes",
                    child_count
                ));
            }
        }

        Ok(SanitizationPlan { removed_items })
    }

    /// Writes a sanitized copy of the archive at `input` into `output`.
    ///
    /// Entries are processed in archive order:
    /// * names ending in `/` are re-created as directory entries,
    /// * metadata parts are parsed, emptied of all root children and written
    ///   back under the same name,
    /// * everything else is carried over with `raw_copy_file`.
    ///
    /// The returned plan lists the metadata parts that had child elements,
    /// using the same wording as `plan`.
    ///
    /// # Errors
    ///
    /// Fails on any I/O, ZIP or XML error encountered while reading `input`
    /// or writing `output`.
    fn sanitize(&self, input: &Path, output: &mut File) -> Result<SanitizationPlan> {
        let source = File::open(input)
            .with_context(|| format!("failed to open OOXML archive {}", input.display()))?;
        let mut archive = ZipArchive::new(source)
            .with_context(|| format!("failed to read OOXML archive {}", input.display()))?;
        let mut writer = ZipWriter::new(&mut *output).set_auto_large_file();
        let options = SimpleFileOptions::default();
        let mut removed_items = Vec::new();

        for position in 0..archive.len() {
            let mut entry = archive.by_index(position)?;
            let name = entry.name().to_string();

            // Directory entries have no payload; just re-create them.
            if name.ends_with('/') {
                writer.add_directory_from_path(Path::new(&name), options)?;
                continue;
            }

            // Non-metadata entries are passed through as they are.
            if !is_metadata_file(&name) {
                writer.raw_copy_file(entry)?;
                continue;
            }

            let mut raw = Vec::new();
            entry.read_to_end(&mut raw)?;
            let mut root = parse_xml(&raw).with_context(|| format!("failed to parse {name}"))?;
            let child_count = clear_metadata_children(&mut root).len();

            // Serialize the emptied root before opening the new entry so a
            // serialization failure does not leave a half-written file behind.
            let mut serialized = Vec::new();
            root.write(&mut serialized)
                .with_context(|| format!("failed to write sanitized XML for {name}"))?;
            writer.start_file_from_path(Path::new(&name), options)?;
            writer.write_all(&serialized)?;

            if child_count != 0 {
                removed_items.push(format!(
                    "{name}: cleared {} child nodes",
                    child_count
                ));
            }
        }

        // `finish` hands back the underlying file; flush it explicitly so
        // write errors surface here instead of being lost.
        let sink = writer.finish()?;
        sink.flush()?;
        Ok(SanitizationPlan { removed_items })
    }
}

/// Returns `true` when `name` is the ZIP entry name of one of the parts
/// listed in `OOXML_METADATA_FILES`.
///
/// The comparison ignores ASCII case: Open Packaging Conventions part names
/// are equivalent regardless of ASCII case, so an archive that stores e.g.
/// `docProps/Core.xml` must not slip past the sanitizer.
fn is_metadata_file(name: &str) -> bool {
    OOXML_METADATA_FILES
        .iter()
        .any(|candidate| candidate.eq_ignore_ascii_case(name))
}

fn parse_xml(bytes: &[u8]) -> Result<Element> {
    Element::parse(Cursor::new(bytes)).map_err(|error| anyhow!(error))
}

/// Collects the tag names of the direct child *elements* of `element`, in
/// document order.
///
/// Non-element children (any other `XMLNode` variant) are skipped.
fn metadata_child_names(element: &Element) -> Vec<String> {
    let mut names = Vec::new();
    for node in &element.children {
        if let XMLNode::Element(child) = node {
            names.push(child.name.clone());
        }
    }
    names
}

fn clear_metadata_children(element: &mut Element) -> Vec<String> {
    let removed = metadata_child_names(element);
    element.children.clear();
    removed
}