eclipse-sanitizer 0.1.0

A fast Rust CLI for sanitizing metadata from documents and images
use anyhow::{anyhow, bail, Context, Result};
use indicatif::{ProgressBar, ProgressStyle};
use rayon::prelude::*;
use rayon::ThreadPool;
use rayon::ThreadPoolBuilder;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::{
    atomic::{AtomicBool, Ordering},
    Arc,
};
use std::time::Duration;
use tempfile::NamedTempFile;
use tracing::{info, warn};
use walkdir::WalkDir;

use crate::audit;
use crate::cli::Cli;
use crate::hashing;
use crate::models::{DiscoveryResult, FileKind, FileReport, FileStatus, FileTask, RunSummary};
use crate::sanitizers;

pub fn run(cli: Cli) -> Result<()> {
    let input = canonicalize_existing(&cli.input).with_context(|| format!("failed to resolve {}", cli.input.display()))?;
    let input_is_directory = input.is_dir();

    let output_root = match cli.output {
        Some(ref output) => Some(canonicalize_or_create_dir(output)?),
        None => None,
    };

    if input_is_directory {
        if let Some(ref output_root) = output_root {
            if output_root == &input {
                bail!("output directory must not be the same as the input directory");
            }
        }
    }

    let log_root = audit_root(&input, input_is_directory, output_root.as_ref())?;
    let _audit_guard = audit::init(&log_root)?;

    let stop_flag = Arc::new(AtomicBool::new(false));
    install_ctrlc_handler(stop_flag.clone())?;

    let pool = build_thread_pool(cli.jobs)?;

    let discovery = discover_tasks(&input, input_is_directory, output_root.as_ref())?;
    if discovery.tasks.is_empty() {
        info!(input = %input.display(), "no supported files found");
        return Ok(());
    }

    let progress = configured_progress(discovery.tasks.len() as u64)?;
    let reports: Vec<FileReport> = run_parallel(pool.as_ref(), discovery.tasks, cli.dry_run, stop_flag.clone(), progress.clone());

    if stop_flag.load(Ordering::SeqCst) {
        progress.finish_with_message("interrupted");
    } else {
        progress.finish_with_message("complete");
    }

    render_reports(&reports, cli.dry_run);

    let summary = RunSummary::from_reports(&reports, discovery.skipped_unsupported);
    log_summary(&summary);

    if summary.interrupted {
        bail!("processing was interrupted");
    }

    if summary.failed > 0 {
        bail!("one or more files failed to sanitize");
    }

    Ok(())
}

/// Processes every task with rayon and collects one report per task.
///
/// When `pool` is `Some`, the parallel iterator is executed inside that pool via `install`;
/// otherwise it is run directly, outside any explicitly supplied pool.
fn run_parallel(pool: Option<&ThreadPool>, tasks: Vec<FileTask>, dry_run: bool, stop_flag: Arc<AtomicBool>, progress: ProgressBar) -> Vec<FileReport> {
    // One definition of the pipeline, shared by both execution paths.
    let process_all = move || -> Vec<FileReport> {
        tasks
            .into_par_iter()
            .map(|task| process_task(task, dry_run, stop_flag.clone(), progress.clone()))
            .collect()
    };

    if let Some(dedicated) = pool {
        dedicated.install(process_all)
    } else {
        process_all()
    }
}

fn process_task(task: FileTask, dry_run: bool, stop_flag: Arc<AtomicBool>, progress: ProgressBar) -> FileReport {
    let source = task.source.clone();
    let destination = task.destination.clone();
    let kind = task.kind;

    let report = process_task_inner(task, dry_run, stop_flag);
    progress.inc(1);

    match report {
        Ok(report) => report,
        Err(error) => FileReport::failed(source, destination, kind, None, error.to_string()),
    }
}

fn process_task_inner(task: FileTask, dry_run: bool, stop_flag: Arc<AtomicBool>) -> Result<FileReport> {
    if stop_flag.load(Ordering::SeqCst) {
        return Ok(FileReport::interrupted(task.source, task.destination, task.kind, None));
    }

    let sanitizer = sanitizers::sanitizer_for(task.kind);
    debug_assert_eq!(sanitizer.kind(), task.kind);
    let original_hash = hashing::hash_file(&task.source)?;

    if dry_run {
        let plan = sanitizer.plan(&task.source)?;
        let report = FileReport::dry_run(task.source.clone(), task.destination.clone(), task.kind, original_hash, plan.removed_items.clone());
        log_file_report(&report);
        return Ok(report);
    }

    if let Some(parent) = task.destination.parent() {
        fs::create_dir_all(parent).with_context(|| format!("failed to create {}", parent.display()))?;
    }

    let temp_dir = task.destination.parent().map(Path::to_path_buf).unwrap_or_else(|| PathBuf::from("."));
    let mut temp_file = NamedTempFile::new_in(&temp_dir).with_context(|| format!("failed to create temp file in {}", temp_dir.display()))?;

    let plan = sanitizer.sanitize(&task.source, temp_file.as_file_mut())?;
    temp_file.as_file_mut().sync_all()?;

    let validation = sanitizer.plan(temp_file.path())?;
    if !validation.is_empty() {
        return Err(anyhow!("sanitized file still contains metadata: {}", validation.removed_items.join(", ")));
    }

    let sanitized_hash = hashing::hash_file(temp_file.path())?;

    if stop_flag.load(Ordering::SeqCst) {
        return Ok(FileReport::interrupted(task.source, task.destination, task.kind, Some(original_hash)));
    }

    temp_file
        .persist(&task.destination)
        .with_context(|| format!("failed to persist sanitized file to {}", task.destination.display()))?;

    let persisted_hash = hashing::hash_file(&task.destination)?;
    if persisted_hash != sanitized_hash {
        if task.destination != task.source {
            let _ = fs::remove_file(&task.destination);
        }

        return Err(anyhow!(
            "post-persist hash mismatch for {}: expected {}, got {}",
            task.destination.display(),
            sanitized_hash,
            persisted_hash
        ));
    }

    let report = FileReport::sanitized(
        task.source,
        task.destination,
        task.kind,
        original_hash,
        sanitized_hash,
        plan.removed_items,
    );

    log_file_report(&report);
    Ok(report)
}

fn discover_tasks(input: &Path, input_is_directory: bool, output_root: Option<&PathBuf>) -> Result<DiscoveryResult> {
    if !input_is_directory {
        let kind = FileKind::from_path(input).ok_or_else(|| {
            anyhow!(
                "unsupported file type for {}. Supported extensions: {}",
                input.display(),
                FileKind::supported_extensions().join(", ")
            )
        })?;

        let destination = output_root
            .and_then(|output_root| input.file_name().map(|file_name| output_root.join(file_name)))
            .unwrap_or_else(|| input.to_path_buf());

        return Ok(DiscoveryResult {
            tasks: vec![FileTask {
                source: input.to_path_buf(),
                destination,
                kind,
            }],
            skipped_unsupported: 0,
        });
    }

    let mut tasks = Vec::new();
    let mut skipped_unsupported = 0_usize;
    let skip_output_root = output_root.and_then(|candidate| if candidate.starts_with(input) { Some(candidate.clone()) } else { None });

    for entry in WalkDir::new(input).follow_links(false).sort_by_file_name() {
        let entry = match entry {
            Ok(entry) => entry,
            Err(error) => {
                warn!(error = %error, "skipping unreadable entry");
                skipped_unsupported += 1;
                continue;
            }
        };

        if !entry.file_type().is_file() {
            continue;
        }

        let source = entry.path();
        if source.file_name().and_then(|name| name.to_str()) == Some(".eclipse_audit.log") {
            continue;
        }

        if let Some(ref skip_root) = skip_output_root {
            if source.starts_with(skip_root) {
                continue;
            }
        }

        let kind = match FileKind::from_path(source) {
            Some(kind) => kind,
            None => {
                skipped_unsupported += 1;
                continue;
            }
        };

        let destination = if let Some(output_root) = output_root {
            output_root.join(source.strip_prefix(input).with_context(|| format!("failed to compute relative path for {}", source.display()))?)
        } else {
            source.to_path_buf()
        };

        tasks.push(FileTask {
            source: source.to_path_buf(),
            destination,
            kind,
        });
    }

    tasks.sort_by(|left, right| left.source.cmp(&right.source));

    Ok(DiscoveryResult {
        tasks,
        skipped_unsupported,
    })
}

/// Creates a progress bar of length `total` with the tool's template and a 100 ms steady tick.
///
/// # Errors
///
/// Fails if the progress template is rejected by `ProgressStyle::with_template`.
fn configured_progress(total: u64) -> Result<ProgressBar> {
    const TEMPLATE: &str = "{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} {msg}";

    let bar = ProgressBar::new(total);
    bar.set_style(ProgressStyle::with_template(TEMPLATE)?.progress_chars("=>-"));

    let tick_interval = Duration::from_millis(100);
    bar.enable_steady_tick(tick_interval);

    Ok(bar)
}

/// Prints one formatted line per report to standard output, in slice order.
fn render_reports(reports: &[FileReport], dry_run: bool) {
    reports
        .iter()
        .map(|report| format_report(report, dry_run))
        .for_each(|line| println!("{}", line));
}

/// Renders a single report as a one-line, human-readable status string.
///
/// Dry-run reports list the detected items by name; sanitized reports show only how many
/// were removed. A sanitized report is tagged `[dry-run]` instead of `[ok]` when `dry_run`
/// is set. Failed reports show the recorded error, or "unknown error" when none is stored.
fn format_report(report: &FileReport, dry_run: bool) -> String {
    let path = report.source.display();
    let label = report.kind.label();
    let nothing_removed = report.removed_items.is_empty();

    match report.status {
        FileStatus::DryRun => {
            let detail = match nothing_removed {
                true => "no metadata markers found".to_string(),
                false => report.removed_items.join(", "),
            };
            format!("[dry-run] {} ({}) -> {}", path, label, detail)
        }
        FileStatus::Sanitized => {
            let detail = match nothing_removed {
                true => "no metadata markers found".to_string(),
                false => format!("{} metadata markers removed", report.removed_items.len()),
            };
            match dry_run {
                true => format!("[dry-run] {} ({}) -> {}", path, label, detail),
                false => format!("[ok] {} ({}) -> {}", path, label, detail),
            }
        }
        FileStatus::Failed => {
            let reason = report.error.as_deref().unwrap_or("unknown error");
            format!("[error] {} ({}) -> {}", path, label, reason)
        }
        FileStatus::Interrupted => format!("[interrupted] {} ({})", path, label),
    }
}

/// Emits one `eclipse::audit` info event describing the outcome for a single file.
///
/// Missing hashes and a missing error are logged as empty strings.
fn log_file_report(report: &FileReport) {
    let status_label = match report.status {
        FileStatus::DryRun => "dry_run",
        FileStatus::Sanitized => "sanitized",
        FileStatus::Failed => "failed",
        FileStatus::Interrupted => "interrupted",
    };

    // Absent optional values are represented as "" in the event.
    let original_hash = report.original_hash.as_deref().unwrap_or("");
    let sanitized_hash = report.sanitized_hash.as_deref().unwrap_or("");
    let error_text = report.error.as_deref().unwrap_or("");

    info!(
        target: "eclipse::audit",
        timestamp = %chrono::Utc::now().to_rfc3339(),
        status = status_label,
        kind = report.kind.label(),
        source = %report.source.display(),
        destination = %report.destination.display(),
        original_hash = original_hash,
        sanitized_hash = sanitized_hash,
        removed_items = ?report.removed_items,
        error = error_text,
        "file processed"
    );
}

fn log_summary(summary: &RunSummary) {
    info!(
        target: "eclipse::audit",
        discovered = summary.discovered,
        skipped_unsupported = summary.skipped_unsupported,
        dry_run = summary.dry_run,
        sanitized = summary.sanitized,
        failed = summary.failed,
        interrupted = summary.interrupted,
        "run complete"
    );
}

/// Registers a CTRL+C handler that sets `stop_flag` to `true`.
///
/// # Errors
///
/// Fails when `ctrlc::set_handler` rejects the registration.
fn install_ctrlc_handler(stop_flag: Arc<AtomicBool>) -> Result<()> {
    let request_stop = move || stop_flag.store(true, Ordering::SeqCst);

    ctrlc::set_handler(request_stop).context("failed to install CTRL+C handler")
}

/// Builds a dedicated rayon pool when a job count was requested.
///
/// Returns `Ok(None)` when `jobs` is `None`, leaving the caller without a dedicated pool.
///
/// # Errors
///
/// Fails when `jobs` is `Some(0)` or when the pool cannot be built.
fn build_thread_pool(jobs: Option<usize>) -> Result<Option<ThreadPool>> {
    let threads = match jobs {
        None => return Ok(None),
        Some(0) => bail!("jobs must be at least 1"),
        Some(count) => count,
    };

    let pool = ThreadPoolBuilder::new()
        .num_threads(threads)
        .build()
        .context("failed to build rayon thread pool")?;

    Ok(Some(pool))
}

/// Canonicalizes `path`, attaching a "does not exist" context to any failure.
fn canonicalize_existing(path: &Path) -> Result<PathBuf> {
    let resolved = fs::canonicalize(path);

    resolved.with_context(|| format!("{} does not exist", path.display()))
}

/// Ensures `path` is a directory (creating it and its parents if absent) and canonicalizes it.
///
/// # Errors
///
/// Fails when `path` exists but is not a directory, when it cannot be created, or when it
/// cannot be canonicalized.
fn canonicalize_or_create_dir(path: &Path) -> Result<PathBuf> {
    if path.is_dir() {
        // Already a directory: nothing to create.
    } else if path.exists() {
        bail!("{} exists but is not a directory", path.display());
    } else {
        fs::create_dir_all(path).with_context(|| format!("failed to create {}", path.display()))?;
    }

    let resolved = fs::canonicalize(path);
    resolved.with_context(|| format!("failed to resolve {}", path.display()))
}

/// Chooses the directory passed to audit initialisation.
///
/// Precedence: the output root when given; otherwise the input itself when it is a
/// directory; otherwise the input file's parent; otherwise the current working directory.
///
/// # Errors
///
/// Fails only when the current working directory is needed and cannot be determined.
fn audit_root(input: &Path, input_is_directory: bool, output_root: Option<&PathBuf>) -> Result<PathBuf> {
    match (output_root, input_is_directory, input.parent()) {
        (Some(root), _, _) => Ok(root.clone()),
        (None, true, _) => Ok(input.to_path_buf()),
        (None, false, Some(parent)) => Ok(parent.to_path_buf()),
        (None, false, None) => std::env::current_dir().context("failed to resolve current directory"),
    }
}