eclipse-sanitizer 0.1.1

A fast Rust CLI for stripping metadata from documents and images.
use anyhow::{Context, Result, bail};
use std::fs::File;
use std::io::Write;
use std::path::Path;

use crate::models::{FileKind, SanitizationPlan};
use crate::sanitizers::Sanitizer;

/// The fixed 8-byte sequence a file must start with to be treated as a PNG.
const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";

/// Stateless [`Sanitizer`] implementation for PNG files.
pub struct PngSanitizer;

impl Sanitizer for PngSanitizer {
    /// Reports that this sanitizer handles [`FileKind::Png`].
    fn kind(&self) -> FileKind {
        FileKind::Png
    }

    /// Dry run: reads `input` fully into memory, runs the metadata-stripping
    /// pass and reports which items it would remove. The sanitized bytes are
    /// discarded and nothing is written.
    ///
    /// # Errors
    /// Fails if `input` cannot be read or is rejected by the stripping pass.
    fn plan(&self, input: &Path) -> Result<SanitizationPlan> {
        let contents = std::fs::read(input)
            .with_context(|| format!("failed to read PNG {}", input.display()))?;
        strip_png_metadata(&contents).map(|(_, removed_items)| SanitizationPlan { removed_items })
    }

    /// Reads `input` fully into memory, strips its metadata, then writes and
    /// flushes the sanitized bytes to `output`. Returns the removed items.
    ///
    /// # Errors
    /// Fails if `input` cannot be read, is rejected by the stripping pass, or
    /// if writing to / flushing `output` fails.
    fn sanitize(&self, input: &Path, output: &mut File) -> Result<SanitizationPlan> {
        let contents = std::fs::read(input)
            .with_context(|| format!("failed to read PNG {}", input.display()))?;
        let (clean, removed_items) = strip_png_metadata(&contents)?;

        output.write_all(&clean)?;
        output.flush()?;

        let plan = SanitizationPlan { removed_items };
        Ok(plan)
    }
}

/// Removes metadata from an in-memory PNG file.
///
/// Walks the chunks that follow the 8-byte signature. Chunks whose type is
/// accepted by `is_metadata_chunk` are dropped and their type name is recorded;
/// every other chunk is copied through unchanged (length, type, payload and the
/// 4 trailing bytes), so checksums are neither validated nor recomputed.
///
/// Bytes that follow the `IEND` chunk are not part of the image and are a
/// common place for leftover or appended data to hide, so they are dropped as
/// well and reported as one additional removed item.
///
/// Returns the sanitized bytes together with the list of removed items.
///
/// # Errors
/// - `bytes` does not start with the PNG signature.
/// - A chunk's declared length runs past the end of `bytes`.
/// - The data ends before an `IEND` chunk is seen.
fn strip_png_metadata(bytes: &[u8]) -> Result<(Vec<u8>, Vec<String>)> {
    // Fixed framing around every chunk payload: 4-byte length + 4-byte type
    // + 4 trailing (CRC) bytes.
    const CHUNK_OVERHEAD: usize = 12;

    if !bytes.starts_with(PNG_SIGNATURE) {
        bail!("input is not a PNG file");
    }

    let mut output = Vec::with_capacity(bytes.len());
    let mut removed_items = Vec::new();
    output.extend_from_slice(PNG_SIGNATURE);

    // `rest` is always the not-yet-consumed suffix of the input.
    let mut rest = &bytes[PNG_SIGNATURE.len()..];

    while rest.len() >= CHUNK_OVERHEAD {
        let payload_len = u32::from_be_bytes([rest[0], rest[1], rest[2], rest[3]]) as usize;

        // Compare against what is left instead of computing an end offset:
        // `offset + 12 + length` can wrap on 32-bit targets for a hostile
        // length field, whereas this subtraction cannot underflow because the
        // loop condition guarantees `rest.len() >= CHUNK_OVERHEAD`.
        if payload_len > rest.len() - CHUNK_OVERHEAD {
            bail!("PNG chunk extends past end of file");
        }

        let (chunk, tail) = rest.split_at(CHUNK_OVERHEAD + payload_len);
        let chunk_type = &chunk[4..8];

        if is_metadata_chunk(chunk_type) {
            removed_items.push(String::from_utf8_lossy(chunk_type).into_owned());
        } else {
            output.extend_from_slice(chunk);
        }

        if chunk_type == b"IEND" {
            // IEND terminates the image; whatever follows is discarded rather
            // than carried into the sanitized file.
            if !tail.is_empty() {
                removed_items.push(format!(
                    "trailing data after IEND ({} bytes)",
                    tail.len()
                ));
            }
            return Ok((output, removed_items));
        }

        rest = tail;
    }

    bail!("PNG file ended before IEND")
}

/// Returns `true` when `chunk_type` is one of the chunk types this sanitizer
/// removes: `tEXt`, `zTXt`, `iTXt`, `eXIf` or `tIME`.
///
/// The comparison is exact and case-sensitive; any slice that is not exactly
/// one of those four-byte names yields `false`.
fn is_metadata_chunk(chunk_type: &[u8]) -> bool {
    const METADATA_CHUNK_TYPES: [&[u8]; 5] = [b"tEXt", b"zTXt", b"iTXt", b"eXIf", b"tIME"];

    METADATA_CHUNK_TYPES
        .iter()
        .any(|known| *known == chunk_type)
}

#[cfg(test)]
mod tests {
    use super::strip_png_metadata;

    /// Serialises one PNG chunk for test input: big-endian payload length,
    /// the 4-byte type, the payload, then four zero bytes standing in for the
    /// CRC (the code under test never inspects it).
    fn chunk(chunk_type: &[u8; 4], payload: &[u8]) -> Vec<u8> {
        let mut encoded = (payload.len() as u32).to_be_bytes().to_vec();
        encoded.extend_from_slice(chunk_type);
        encoded.extend_from_slice(payload);
        encoded.extend_from_slice(&[0u8; 4]);
        encoded
    }

    /// A `tEXt` chunk must be dropped and reported, while the structural
    /// chunks around it must survive.
    #[test]
    fn strips_png_text_chunks() {
        let mut png = b"\x89PNG\r\n\x1a\n".to_vec();
        png.extend(chunk(b"IHDR", &[0; 13]));
        png.extend(chunk(b"tEXt", b"Comment\0secret"));
        png.extend(chunk(b"IDAT", b"compressed"));
        png.extend(chunk(b"IEND", &[]));

        let (output, removed) = strip_png_metadata(&png).expect("png should sanitize");

        // True when the four-byte chunk name occurs anywhere in the output.
        let has_chunk = |needle: &[u8; 4]| output.windows(4).any(|window| window == needle);

        assert!(has_chunk(b"IHDR"));
        assert!(has_chunk(b"IDAT"));
        assert!(has_chunk(b"IEND"));
        assert!(!has_chunk(b"tEXt"));
        assert!(removed.iter().any(|name| name.as_str() == "tEXt"));
    }
}