use std::collections::BTreeSet;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;

use anyhow::{Context, Result};
use lopdf::{Dictionary, Document, Object};

use crate::models::{FileKind, SanitizationPlan};
use crate::sanitizers::Sanitizer;
/// Dictionary keys that `scrub_dictionary` removes from a dictionary it is
/// handed (in this file: the trailer's `/Info` dictionary).
///
/// Keys are raw PDF name bytes without the leading `/`. The names correspond
/// to the entries of the PDF document information dictionary (title, author,
/// producing software, timestamps, ...).
const PDF_METADATA_KEYS: &[&[u8]] = &[
b"Author",
b"CreationDate",
b"Creator",
b"Keywords",
b"ModDate",
b"Producer",
b"Subject",
b"Title",
b"Trapped",
];
/// Stateless [`Sanitizer`] implementation for PDF files.
///
/// It carries no data; all work happens in the trait methods, which load the
/// PDF with `lopdf` and strip its document-level metadata.
pub struct PdfSanitizer;
impl Sanitizer for PdfSanitizer {
fn kind(&self) -> FileKind {
FileKind::Pdf
}
fn plan(&self, input: &Path) -> Result<SanitizationPlan> {
let mut document = Document::load(input)
.with_context(|| format!("failed to load PDF {}", input.display()))?;
let removed_items = strip_pdf_metadata(&mut document)?;
Ok(SanitizationPlan { removed_items })
}
fn sanitize(&self, input: &Path, output: &mut File) -> Result<SanitizationPlan> {
let mut document = Document::load(input)
.with_context(|| format!("failed to load PDF {}", input.display()))?;
let removed_items = strip_pdf_metadata(&mut document)?;
document
.save_to(output)
.with_context(|| format!("failed to write sanitized PDF {}", input.display()))?;
output.sync_all()?;
Ok(SanitizationPlan { removed_items })
}
}
/// Removes document-level metadata from `document` in place and reports what
/// was removed.
///
/// Steps, in order:
///
/// 1. The `/Info` entry is taken out of the trailer. If it is an inline
///    dictionary, or an indirect reference that resolves to a dictionary, the
///    known metadata keys inside it are scrubbed and recorded individually.
/// 2. The `/Metadata` entry is removed from the document catalog, when a
///    catalog can be obtained.
/// 3. `Document::prune_objects` is called so objects that are no longer
///    referenced can be dropped.
///
/// Returns human-readable labels for everything removed, deduplicated and in
/// sorted order (they are collected through a `BTreeSet`).
///
/// # Errors
///
/// This implementation does not currently produce an error; the `Result`
/// return type is kept so callers keep working unchanged.
fn strip_pdf_metadata(document: &mut Document) -> Result<Vec<String>> {
    let mut removed = BTreeSet::new();

    if let Some(info_object) = document.trailer.remove(b"Info") {
        match info_object {
            Object::Reference(info_id) => {
                // A reference to a missing or non-dictionary object is not a
                // reason to abort: the trailer entry is already gone, so there
                // is simply nothing further to scrub behind it.
                if let Ok(info_dictionary) = document.get_dictionary_mut(info_id) {
                    scrub_dictionary(info_dictionary, "trailer /Info", &mut removed);
                }
            }
            Object::Dictionary(mut info_dictionary) => {
                // The inline dictionary was already detached from the trailer;
                // scrubbing it here only serves to record which keys it held.
                scrub_dictionary(&mut info_dictionary, "trailer /Info", &mut removed);
            }
            _ => {}
        }
        removed.insert("trailer /Info".to_string());
    }

    // Best effort: a document whose catalog cannot be resolved is treated as
    // having no catalog metadata.
    let catalog_metadata_removed = document
        .catalog_mut()
        .map(|catalog| catalog.remove(b"Metadata").is_some())
        .unwrap_or(false);
    if catalog_metadata_removed {
        removed.insert("catalog /Metadata".to_string());
    }

    document.prune_objects();
    Ok(removed.into_iter().collect())
}
/// Deletes every known metadata entry from `dictionary` and records each
/// deletion in `removed`.
///
/// The keys tried are those in `PDF_METADATA_KEYS`, in order, followed by
/// `Metadata`. Each key that was actually present is recorded as
/// `"{context} /{key}"`; keys that were absent leave no record.
fn scrub_dictionary(dictionary: &mut Dictionary, context: &str, removed: &mut BTreeSet<String>) {
    // `Metadata` goes last so the removal order matches the table-then-extra
    // sequence this function has always used.
    let targets = PDF_METADATA_KEYS
        .iter()
        .copied()
        .chain(std::iter::once(&b"Metadata"[..]));

    for name in targets {
        if dictionary.remove(name).is_none() {
            continue;
        }
        removed.insert(format!("{context} /{}", String::from_utf8_lossy(name)));
    }
}