eclipse-sanitizer 0.1.0

A fast Rust CLI for sanitizing metadata from documents and images
use anyhow::{Context, Result};
use lopdf::{Dictionary, Document, Object};
use std::collections::BTreeSet;
use std::fs::File;
use std::path::Path;

use crate::models::{FileKind, SanitizationPlan};
use crate::sanitizers::Sanitizer;

/// Names of the dictionary entries that `scrub_dictionary` removes and reports.
///
/// These are the standard entries of a PDF document information (`/Info`)
/// dictionary. Each name is stored as raw bytes, without the leading `/`,
/// because that is the key form passed to `Dictionary::remove`.
const PDF_METADATA_KEYS: &[&[u8]] = &[
    b"Author",
    b"CreationDate",
    b"Creator",
    b"Keywords",
    b"ModDate",
    b"Producer",
    b"Subject",
    b"Title",
    b"Trapped",
];

/// Stateless [`Sanitizer`] implementation for PDF files.
///
/// Its `kind()` is [`FileKind::Pdf`]; both `plan` and `sanitize` load the
/// input with `lopdf` and strip metadata via `strip_pdf_metadata`.
pub struct PdfSanitizer;

impl Sanitizer for PdfSanitizer {
    fn kind(&self) -> FileKind {
        FileKind::Pdf
    }

    fn plan(&self, input: &Path) -> Result<SanitizationPlan> {
        let mut document = Document::load(input).with_context(|| format!("failed to load PDF {}", input.display()))?;
        let removed_items = strip_pdf_metadata(&mut document)?;
        Ok(SanitizationPlan { removed_items })
    }

    fn sanitize(&self, input: &Path, output: &mut File) -> Result<SanitizationPlan> {
        let mut document = Document::load(input).with_context(|| format!("failed to load PDF {}", input.display()))?;
        let removed_items = strip_pdf_metadata(&mut document)?;
        document
            .save_to(output)
            .with_context(|| format!("failed to write sanitized PDF {}", input.display()))?;
        output.sync_all()?;
        Ok(SanitizationPlan { removed_items })
    }
}

/// Removes document-level metadata from `document` in place and reports what
/// was removed.
///
/// The steps are:
///
/// 1. The trailer's `/Info` entry is removed. If it held a dictionary, or a
///    reference that resolves to one, that dictionary is also scrubbed with
///    `scrub_dictionary` so the individual entries show up in the report.
/// 2. The catalog's `/Metadata` entry is removed when the catalog can be
///    resolved; a document without a usable catalog is left as it is.
/// 3. `Document::prune_objects` is called so objects that are no longer
///    referenced (such as a detached info dictionary) are dropped.
///
/// The returned labels are unique and in ascending order, because they are
/// collected through a `BTreeSet`.
///
/// # Errors
///
/// This function currently never fails; the `Result` return type is kept so
/// existing callers that use `?` keep working.
fn strip_pdf_metadata(document: &mut Document) -> Result<Vec<String>> {
    const INFO_LABEL: &str = "trailer /Info";

    let mut removed = BTreeSet::new();

    if let Some(info_object) = document.trailer.remove(b"Info") {
        match info_object {
            Object::Reference(info_id) => {
                // A reference to a missing (or non-dictionary) object is
                // treated like the direct non-dictionary case below: there is
                // nothing to scrub, and the trailer entry is already gone, so
                // it must not abort the whole sanitization.
                if let Ok(info_dictionary) = document.get_dictionary_mut(info_id) {
                    scrub_dictionary(info_dictionary, INFO_LABEL, &mut removed);
                }
            }
            Object::Dictionary(mut info_dictionary) => {
                // The dictionary is already detached from the trailer; it is
                // scrubbed only to record which entries it carried.
                scrub_dictionary(&mut info_dictionary, INFO_LABEL, &mut removed);
            }
            _ => {}
        }

        removed.insert(INFO_LABEL.to_string());
    }

    if let Ok(catalog) = document.catalog_mut() {
        if catalog.remove(b"Metadata").is_some() {
            removed.insert("catalog /Metadata".to_string());
        }
    }

    document.prune_objects();

    Ok(removed.into_iter().collect())
}

/// Removes every known metadata entry from `dictionary` and records each
/// removal in `removed`.
///
/// The entries tried are all names in `PDF_METADATA_KEYS` followed by
/// `Metadata`. For each entry that was present, the label
/// `"{context} /{name}"` is inserted into `removed`; absent entries leave no
/// trace.
fn scrub_dictionary(dictionary: &mut Dictionary, context: &str, removed: &mut BTreeSet<String>) {
    let metadata_key: &[u8] = b"Metadata";

    let stripped_labels = PDF_METADATA_KEYS
        .iter()
        .copied()
        .chain(std::iter::once(metadata_key))
        // Only entries that actually existed are reported.
        .filter(|name| dictionary.remove(*name).is_some())
        .map(|name| format!("{context} /{}", String::from_utf8_lossy(name)));

    removed.extend(stripped_labels);
}