eclipse-sanitizer 0.1.0

A fast Rust CLI for sanitizing metadata from documents and images
use anyhow::{bail, Context, Result};
use std::fs::File;
use std::io::Write;
use std::path::Path;

use crate::models::{FileKind, SanitizationPlan};
use crate::sanitizers::Sanitizer;

/// Stateless unit type that implements `Sanitizer` for JPEG files.
pub struct JpegSanitizer;

impl Sanitizer for JpegSanitizer {
    /// Identifies this sanitizer as the JPEG handler.
    fn kind(&self) -> FileKind {
        FileKind::Jpeg
    }

    /// Reports which items would be removed from `input` without writing anything.
    ///
    /// The whole file is read into memory and run through the same stripping
    /// routine as `sanitize`; only the list of removed items is kept.
    fn plan(&self, input: &Path) -> Result<SanitizationPlan> {
        let raw = std::fs::read(input)
            .with_context(|| format!("failed to read JPEG {}", input.display()))?;

        strip_jpeg_metadata(&raw).map(|(_, removed_items)| SanitizationPlan { removed_items })
    }

    /// Writes a metadata-free copy of `input` to `output` and reports what was removed.
    ///
    /// Nothing is written unless the whole input was parsed successfully.
    fn sanitize(&self, input: &Path, output: &mut File) -> Result<SanitizationPlan> {
        let raw = std::fs::read(input)
            .with_context(|| format!("failed to read JPEG {}", input.display()))?;

        let (cleaned, removed_items) = strip_jpeg_metadata(&raw)?;

        output.write_all(&cleaned)?;
        output.flush()?;

        Ok(SanitizationPlan { removed_items })
    }
}

/// Removes metadata segments from an in-memory JPEG and reports what was dropped.
///
/// Walks the marker segments that precede the first scan. Segments for which
/// `metadata_label` returns a label are left out of the output and their labels
/// are collected; every other segment is copied unchanged. Once a start-of-scan
/// (`FF DA`) or end-of-image (`FF D9`) marker is reached, the rest of the input
/// (including any bytes after the end-of-image marker) is copied verbatim.
///
/// Returns the sanitized bytes together with the labels of the removed segments.
///
/// # Errors
///
/// Fails when the input does not start with the start-of-image marker, when a
/// byte other than `0xFF` appears where a marker is expected, when a segment
/// length is invalid or runs past the end of the input, or when no end-of-image
/// marker is found.
fn strip_jpeg_metadata(bytes: &[u8]) -> Result<(Vec<u8>, Vec<String>)> {
    const EOI: [u8; 2] = [0xFF, 0xD9];

    if bytes.len() < 4 || bytes[0] != 0xFF || bytes[1] != 0xD8 {
        bail!("input is not a JPEG file");
    }

    let mut output = Vec::with_capacity(bytes.len());
    let mut removed_items = Vec::new();
    output.extend_from_slice(&bytes[..2]);

    let mut cursor = 2_usize;
    let mut saw_end_marker = false;

    while cursor < bytes.len() {
        // A marker is one or more 0xFF bytes (the extras are fill bytes)
        // followed by a single non-0xFF marker code.
        let marker_start = cursor;
        while cursor < bytes.len() && bytes[cursor] == 0xFF {
            cursor += 1;
        }

        if cursor >= bytes.len() {
            break;
        }

        if marker_start == cursor {
            bail!("unexpected JPEG data outside of a marker segment");
        }

        let marker = bytes[cursor];
        cursor += 1;

        match marker {
            0xD9 => {
                // End of image before any scan: keep the marker and whatever follows it.
                output.extend_from_slice(&bytes[marker_start..]);
                saw_end_marker = true;
                break;
            }
            0xDA => {
                // Start of scan: the rest of the file is copied untouched.
                let end = consume_segment_end(bytes, &mut cursor)?;
                output.extend_from_slice(&bytes[marker_start..]);
                // Look for the end-of-image marker only in the data that follows
                // the scan header. Searching the whole output could be satisfied
                // by an `FF D9` pair inside a retained header segment, letting a
                // truncated file through.
                saw_end_marker = bytes[end..].windows(2).any(|pair| pair == EOI);
                break;
            }
            0x01 | 0xD0..=0xD7 => {
                // Stand-alone markers carry no length field or payload.
                output.extend_from_slice(&bytes[marker_start..cursor]);
            }
            _ => {
                let end = consume_segment_end(bytes, &mut cursor)?;
                if let Some(label) = metadata_label(marker, &bytes[marker_start..end]) {
                    removed_items.push(label);
                } else {
                    output.extend_from_slice(&bytes[marker_start..end]);
                }
            }
        }
    }

    // Both branches that set the flag have already copied the marker into
    // `output`, so no second scan of the output is needed.
    if !saw_end_marker {
        bail!("JPEG file ended before an end-of-image marker was found");
    }

    Ok((output, removed_items))
}

/// Moves `cursor` past the length-prefixed segment whose length field starts at `*cursor`.
///
/// The length field is two bytes, big-endian, and its value is added to the
/// position of the field itself to find the end of the segment. On success the
/// cursor is set to that end offset, which is also returned. On failure the
/// cursor is left untouched.
///
/// # Errors
///
/// Fails when fewer than two bytes remain for the length field, when the
/// declared length is below two, or when the segment would extend beyond
/// `bytes`.
fn consume_segment_end(bytes: &[u8], cursor: &mut usize) -> Result<usize> {
    let length_offset = *cursor;

    let declared = match bytes.get(length_offset..length_offset + 2) {
        Some(&[high, low]) => usize::from(u16::from_be_bytes([high, low])),
        _ => bail!("unexpected end of JPEG segment"),
    };
    if declared < 2 {
        bail!("JPEG segment length is invalid");
    }

    let next = length_offset + declared;
    if next > bytes.len() {
        bail!("JPEG segment extends past the end of the file");
    }

    *cursor = next;
    Ok(next)
}

/// Classifies a marker segment, returning a label when it should be removed.
///
/// `segment` is the raw segment as it appears in the file: one or more `0xFF`
/// bytes (a marker may be preceded by extra fill bytes), the marker code, the
/// two-byte length field, then the payload. The payload is located relative to
/// the marker code rather than at a fixed offset, so leading fill bytes do not
/// misalign the Exif/XMP signature checks.
///
/// Returns `Some(label)` for APP1 (`0xE1`), APP13 (`0xED`) and comment (`0xFE`)
/// segments, and `None` for every other marker.
fn metadata_label(marker: u8, segment: &[u8]) -> Option<String> {
    // The first non-0xFF byte is the marker code; the payload begins after it
    // and the two length bytes. Short or malformed input yields an empty payload.
    let payload = segment
        .iter()
        .position(|&byte| byte != 0xFF)
        .and_then(|marker_index| segment.get(marker_index + 3..))
        .unwrap_or_default();

    let label = match marker {
        0xE1 if payload.starts_with(b"Exif\0\0") => "APP1/EXIF",
        0xE1 if payload.starts_with(b"http://ns.adobe.com/xap/1.0/\0") => "APP1/XMP",
        0xE1 => "APP1 metadata",
        0xED => "APP13/IPTC-Photoshop",
        0xFE => "JPEG comment",
        _ => return None,
    };

    Some(label.to_string())
}

#[cfg(test)]
mod tests {
    use super::strip_jpeg_metadata;

    /// Encodes one marker segment: `FF <marker>`, a big-endian length that
    /// counts itself plus the payload, then the payload bytes.
    fn segment(marker: u8, payload: &[u8]) -> Vec<u8> {
        let declared_len = (payload.len() + 2) as u16;
        let mut encoded = Vec::with_capacity(payload.len() + 4);
        encoded.push(0xFF);
        encoded.push(marker);
        encoded.extend_from_slice(&declared_len.to_be_bytes());
        encoded.extend_from_slice(payload);
        encoded
    }

    /// Returns true when `needle` occurs as a contiguous run inside `haystack`.
    fn contains(haystack: &[u8], needle: &[u8]) -> bool {
        haystack.windows(needle.len()).any(|window| window == needle)
    }

    #[test]
    fn strips_metadata_segments_but_keeps_jpeg_structure() {
        let input: Vec<u8> = [
            vec![0xFF, 0xD8],
            segment(0xE0, b"JFIF\0\x01\x02"),
            segment(0xE1, b"Exif\0\0metadata"),
            segment(0xFE, b"comment"),
            vec![0xFF, 0xD9],
        ]
        .concat();

        let (output, removed) = strip_jpeg_metadata(&input).expect("jpeg should sanitize");

        // Structural markers and the JFIF header survive.
        assert!(output.starts_with(&[0xFF, 0xD8]));
        assert!(contains(&output, &[0xFF, 0xD9]));
        assert!(contains(&output, &[0xFF, 0xE0, 0x00, 0x09]));

        // The EXIF and comment segments are gone and reported.
        assert!(!contains(&output, &[0xFF, 0xE1]));
        assert!(!contains(&output, &[0xFF, 0xFE]));
        assert!(removed.iter().any(|item| item == "APP1/EXIF"));
        assert!(removed.iter().any(|item| item == "JPEG comment"));
    }

    #[test]
    fn rejects_non_jpeg_input() {
        let message = strip_jpeg_metadata(b"not a jpeg")
            .expect_err("should reject invalid input")
            .to_string();
        assert!(message.contains("input is not a JPEG file"));
    }
}