eclipse-sanitizer 0.1.0

A fast Rust CLI for sanitizing metadata from documents and images
use anyhow::{bail, Context, Result};
use std::fs::File;
use std::io::Write;
use std::path::Path;

use crate::models::{FileKind, SanitizationPlan};
use crate::sanitizers::Sanitizer;

/// The fixed 8-byte signature expected at the very start of a PNG file.
/// Input that does not begin with these bytes is rejected as "not a PNG file".
const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";

/// Stateless sanitizer that strips metadata chunks from PNG files.
pub struct PngSanitizer;

impl Sanitizer for PngSanitizer {
	/// Reports the file kind this sanitizer handles: always `FileKind::Png`.
	fn kind(&self) -> FileKind {
		FileKind::Png
	}

	/// Dry run: reads `input`, works out which items would be removed, and
	/// discards the sanitized bytes without writing anything.
	///
	/// # Errors
	/// Fails if the file cannot be read or `strip_png_metadata` rejects it.
	fn plan(&self, input: &Path) -> Result<SanitizationPlan> {
		let raw = std::fs::read(input).with_context(|| format!("failed to read PNG {}", input.display()))?;
		strip_png_metadata(&raw).map(|(_, removed_items)| SanitizationPlan { removed_items })
	}

	/// Reads `input`, strips its metadata, writes the result to `output`,
	/// flushes it, and returns the list of removed items.
	///
	/// The input is read and processed completely before the first byte is
	/// written, so a malformed PNG leaves `output` untouched.
	///
	/// # Errors
	/// Fails if the file cannot be read, `strip_png_metadata` rejects it, or
	/// writing/flushing `output` fails.
	fn sanitize(&self, input: &Path, output: &mut File) -> Result<SanitizationPlan> {
		let raw = std::fs::read(input).with_context(|| format!("failed to read PNG {}", input.display()))?;
		let (clean, removed_items) = strip_png_metadata(&raw)?;

		output.write_all(&clean).and_then(|()| output.flush())?;

		Ok(SanitizationPlan { removed_items })
	}
}

/// Removes metadata from an in-memory PNG file.
///
/// After the 8-byte signature the input is walked chunk by chunk. Each chunk
/// is laid out as a 4-byte big-endian payload length, a 4-byte type, the
/// payload, and a 4-byte CRC. Chunks whose type satisfies `is_metadata_chunk`
/// are dropped; every other chunk is copied through byte-for-byte (CRCs are
/// neither verified nor recomputed). Processing stops at `IEND`.
///
/// Any bytes following the `IEND` chunk are not part of the image and are
/// dropped as well, so payloads appended to the file cannot survive
/// sanitization. When such bytes exist, the removal is reported.
///
/// # Returns
/// A pair of (sanitized file bytes, descriptions of the removed items). Removed
/// chunks are listed by their 4-character type, in file order.
///
/// # Errors
/// * `input is not a PNG file`: the signature is missing or wrong.
/// * `PNG chunk extends past end of file`: a chunk's declared length does
///   not fit in the remaining input.
/// * `PNG file ended before IEND`: no `IEND` chunk was found.
fn strip_png_metadata(bytes: &[u8]) -> Result<(Vec<u8>, Vec<String>)> {
	// Bytes every chunk occupies besides its payload: length (4) + type (4) + CRC (4).
	const CHUNK_OVERHEAD: usize = 12;

	if !bytes.starts_with(PNG_SIGNATURE) {
		bail!("input is not a PNG file");
	}

	let mut cursor = PNG_SIGNATURE.len();
	let mut output = Vec::with_capacity(bytes.len());
	let mut removed_items = Vec::new();
	output.extend_from_slice(PNG_SIGNATURE);

	let mut saw_iend = false;

	// `cursor` never exceeds `bytes.len()` (it starts at the signature length
	// and only advances to a validated chunk end), so this cannot underflow.
	while bytes.len() - cursor >= CHUNK_OVERHEAD {
		// A `u32` always fits in `usize` on 32- and 64-bit targets.
		let length = u32::from_be_bytes([bytes[cursor], bytes[cursor + 1], bytes[cursor + 2], bytes[cursor + 3]]) as usize;
		let chunk_type = &bytes[cursor + 4..cursor + 8];

		// The length comes from the (untrusted) file. Compare it against the
		// space that is left instead of computing `cursor + 12 + length` first:
		// that sum can overflow `usize` on 32-bit targets and bypass the check.
		let payload_room = bytes.len() - cursor - CHUNK_OVERHEAD;
		if length > payload_room {
			bail!("PNG chunk extends past end of file");
		}
		let chunk_end = cursor + CHUNK_OVERHEAD + length;

		if is_metadata_chunk(chunk_type) {
			removed_items.push(String::from_utf8_lossy(chunk_type).to_string());
		} else {
			output.extend_from_slice(&bytes[cursor..chunk_end]);
		}

		if chunk_type == b"IEND" {
			saw_iend = true;
			// Decoders ignore everything after IEND, which makes it a hiding
			// place for arbitrary data; strip it and tell the caller.
			let trailing = bytes.len() - chunk_end;
			if trailing > 0 {
				removed_items.push(format!("trailing data after IEND ({} bytes)", trailing));
			}
			break;
		}

		cursor = chunk_end;
	}

	if !saw_iend {
		bail!("PNG file ended before IEND");
	}

	Ok((output, removed_items))
}

/// Returns `true` when `chunk_type` is one of the chunk types this sanitizer
/// strips: `tEXt`, `zTXt`, `iTXt`, `eXIf` or `tIME`.
///
/// The comparison is exact and case-sensitive; a slice of any other length
/// never matches.
fn is_metadata_chunk(chunk_type: &[u8]) -> bool {
	const METADATA_CHUNK_TYPES: [&[u8]; 5] = [b"tEXt", b"zTXt", b"iTXt", b"eXIf", b"tIME"];

	METADATA_CHUNK_TYPES.iter().any(|known| *known == chunk_type)
}

#[cfg(test)]
mod tests {
	use super::strip_png_metadata;

	/// Serializes one PNG chunk: big-endian payload length, chunk type,
	/// payload, then four zero bytes standing in for the CRC.
	fn chunk(chunk_type: &[u8; 4], payload: &[u8]) -> Vec<u8> {
		let length = (payload.len() as u32).to_be_bytes();
		[&length[..], &chunk_type[..], payload, &[0u8; 4][..]].concat()
	}

	/// A `tEXt` chunk must be dropped and reported while the image chunks
	/// around it are kept.
	#[test]
	fn strips_png_text_chunks() {
		let input = [
			b"\x89PNG\r\n\x1a\n".to_vec(),
			chunk(b"IHDR", &[0; 13]),
			chunk(b"tEXt", b"Comment\0secret"),
			chunk(b"IDAT", b"compressed"),
			chunk(b"IEND", &[]),
		]
		.concat();

		let (output, removed) = strip_png_metadata(&input).expect("png should sanitize");

		// True when the four-byte tag occurs anywhere in the sanitized output.
		let has_tag = |tag: &[u8; 4]| output.windows(4).any(|window| window == tag);

		assert!(has_tag(b"IHDR"));
		assert!(has_tag(b"IDAT"));
		assert!(has_tag(b"IEND"));
		assert!(!has_tag(b"tEXt"));
		assert!(removed.iter().any(|name| name == "tEXt"));
	}
}