// simdutf8-cli 0.2.7

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: 2025,2026 ndaal Gesellschaft für Sicherheit in der Informationstechnik mbH & Co KG, Cologne
// SPDX-FileCopyrightText: Author: Pierre Gronau <Pierre.Gronau@ndaal.eu>

//! Command-line interface: argument definitions and the [`run`] entry point.
//!
//! [`run`] is generic over its output and error sinks so it can be driven both
//! by the real binary (locked stdout/stderr) and by tests (in-memory buffers).
//! It returns the process exit code:
//!
//! | code | meaning                                                                     |
//! |------|-----------------------------------------------------------------------------|
//! | `0`  | every input was valid UTF-8                                                 |
//! | `1`  | every input was readable, but at least one was invalid                      |
//! | `2`  | at least one input could not be read securely, or a walk/report step failed |

use std::ffi::OsStr;
use std::io::Write;
use std::path::{Path, PathBuf};

use clap::Parser;
use ignore::overrides::OverrideBuilder;
use ignore::{Walk, WalkBuilder};

use crate::path_security::{
    read_capped, write_in_dir, PathPolicy, PathSecurityError, DEFAULT_MAX_FILE_SIZE,
};
use crate::report::{
    build_sarif, json_block, sarif_to_markdown, text_block, Finding, OutputFormat, ReportError,
};
use crate::validate::validate;

// NOTE: clap's derive macro turns the `///` comment on each field below into
// that option's `--help` text, so rewording those comments changes
// user-visible output. Maintainer notes therefore use plain `//` comments.
/// Validate whether files (or standard input) contain well-formed UTF-8, fast.
#[derive(Debug, Parser)]
#[command(name = "simdutf8-cli", version, about, long_about = None)]
pub struct Args {
    // Positional arguments. An empty list makes `collect_inputs` read standard
    // input instead.
    /// Files or directories to validate. Directories are walked recursively.
    /// Use `-` or pass none to read standard input.
    #[arg(value_name = "PATH")]
    pub files: Vec<PathBuf>,

    // Each glob is handed to the directory walker as a negated (`!`-prefixed)
    // override in `build_walker`.
    /// Exclude paths matching this glob when walking directories (repeatable,
    /// gitignore syntax, e.g. `--exclude '*.min.js' --exclude target`).
    #[arg(long, value_name = "GLOB")]
    pub exclude: Vec<String>,

    // Switches off every ignore-file source configured in `build_walker`
    // (git ignore, global git ignore, git exclude, `.ignore`, parent lookups).
    /// When walking directories, do not respect `.gitignore` / `.ignore` files
    /// (they are respected by default).
    #[arg(long)]
    pub no_ignore: bool,

    // Inverted before being passed to the walker's `hidden` setting.
    /// When walking directories, also descend into hidden files and directories
    /// (skipped by default).
    #[arg(long)]
    pub hidden: bool,

    // Forwarded to `PathPolicy::base_dir` only when present.
    /// Confine inputs to this base directory (rejects traversal & symlink escapes).
    #[arg(long, value_name = "DIR")]
    pub base_dir: Option<PathBuf>,

    // Feeds `PathPolicy::max_file_size` and is also the cap used when reading
    // standard input.
    /// Maximum number of bytes to read from each input.
    #[arg(long, value_name = "BYTES", default_value_t = DEFAULT_MAX_FILE_SIZE)]
    pub max_size: u64,

    // Inverted into `PathPolicy::allow_symlinks`.
    /// Reject symbolic links instead of resolving them.
    #[arg(long)]
    pub no_follow_symlinks: bool,

    // Selects the renderer used by `render_stdout`.
    /// Output format for stdout.
    #[arg(long, value_enum, default_value_t = OutputFormat::Text)]
    pub format: OutputFormat,

    // Skips the stdout rendering phase in `run`; the report files are still
    // written unless `no_report` is set as well.
    /// Suppress per-input output and rely on the exit code only.
    #[arg(short, long)]
    pub quiet: bool,

    // Target directory for `write_reports`; unused when `no_report` is set.
    /// Directory for the auto-generated `report.sarif` / `report.md` files.
    #[arg(long, value_name = "DIR", default_value = ".")]
    pub output_dir: PathBuf,

    // Skips the report-writing phase in `run`.
    /// Do not auto-generate the `report.sarif` / `report.md` files.
    #[arg(long)]
    pub no_report: bool,
}

/// Execute a complete validation run for `args`.
///
/// Findings are rendered to `out` (unless `--quiet` is set), diagnostics are
/// written to `err`, and the auto-generated report files are produced unless
/// `--no-report` is set. The returned value is the process exit code.
///
/// # Errors
///
/// Returns an error only when a write to `out` or `err` fails; per-input
/// failures are reported via `err` and reflected in the returned exit code.
pub fn run<O: Write, E: Write>(args: &Args, out: &mut O, err: &mut E) -> std::io::Result<u8> {
    let mut state = RunState::default();
    let policy = build_policy(args);

    // Phase 1: read and validate every input.
    state.collect_inputs(args, &policy, err)?;

    // Phase 2: render the chosen format, unless stdout output is suppressed.
    let render_to_stdout = !args.quiet;
    if render_to_stdout {
        state.emit_stdout(args, out, err)?;
    }

    // Phase 3: auto-write report.sarif / report.md, unless disabled.
    let write_report_files = !args.no_report;
    if write_report_files {
        state.emit_reports(args, err)?;
    }

    Ok(state.exit_code())
}

/// Translate the command-line arguments into the [`PathPolicy`] used for every
/// file read: the size cap, the symlink policy, and (only when `--base-dir` was
/// given) the confining base directory.
fn build_policy(args: &Args) -> PathPolicy {
    let unconfined = PathPolicy::new()
        .max_file_size(args.max_size)
        .allow_symlinks(!args.no_follow_symlinks);
    match &args.base_dir {
        Some(base) => unconfined.base_dir(base.clone()),
        None => unconfined,
    }
}

/// Decide whether `path` should be walked as a directory.
///
/// Any failure to stat the path counts as "not a directory", so the real error
/// is reported later by the read attempt instead of here.
fn is_dir(path: &Path) -> bool {
    match std::fs::metadata(path) {
        Ok(meta) => meta.is_dir(),
        Err(_) => false,
    }
}

/// Construct the directory walker for `dir`.
///
/// The walker applies every `--exclude` glob, the ignore-file rules (unless
/// `--no-ignore`), and the hidden-file policy (unless `--hidden`). `.gitignore`
/// is respected even outside a git repository (`require_git(false)`), and
/// symbolic links are not followed (`follow_links(false)`).
///
/// # Errors
///
/// Returns the `ignore` error raised by an `--exclude` glob that cannot be
/// added, or by building the override set.
fn build_walker(dir: &Path, args: &Args) -> Result<Walk, ignore::Error> {
    let mut excludes = OverrideBuilder::new(dir);
    for glob in &args.exclude {
        // A leading `!` makes an override glob an *exclude* (ripgrep semantics).
        let negated = format!("!{glob}");
        excludes.add(&negated)?;
    }
    let excludes = excludes.build()?;

    let honour_ignore_files = !args.no_ignore;
    let skip_hidden = !args.hidden;

    let mut walk = WalkBuilder::new(dir);
    walk.overrides(excludes);
    walk.hidden(skip_hidden);
    walk.git_ignore(honour_ignore_files);
    walk.git_global(honour_ignore_files);
    walk.git_exclude(honour_ignore_files);
    walk.ignore(honour_ignore_files);
    walk.parents(honour_ignore_files);
    walk.require_git(false);
    walk.follow_links(false);
    Ok(walk.build())
}

/// Read the locked standard-input stream through `read_capped`, passing `limit`
/// as the byte cap.
fn read_stdin(limit: u64) -> Result<Vec<u8>, PathSecurityError> {
    read_capped(std::io::stdin().lock(), limit)
}

/// Produce the stdout payload for `findings` in the requested `format`.
///
/// Text and JSON come straight from their block renderers. SARIF gets a
/// newline appended; Markdown is derived from the SARIF document and gets a
/// newline appended only when it does not already end with one.
///
/// # Errors
///
/// Propagates any failure from building the SARIF document or converting it to
/// Markdown.
fn render_stdout(
    format: OutputFormat,
    findings: &[Finding],
) -> std::result::Result<String, ReportError> {
    let rendered = match format {
        OutputFormat::Text => text_block(findings),
        OutputFormat::Json => json_block(findings),
        OutputFormat::Sarif => {
            let mut document = build_sarif(findings)?;
            document.push('\n');
            document
        },
        OutputFormat::Markdown => {
            let document = build_sarif(findings)?;
            let mut page = sarif_to_markdown(&document)?;
            if !page.ends_with('\n') {
                page.push('\n');
            }
            page
        },
    };
    Ok(rendered)
}

/// Build, strict-validate, and write `report.sarif` + `report.md` into
/// `output_dir` via a capability-scoped directory handle.
fn write_reports(output_dir: &Path, findings: &[Finding]) -> std::result::Result<(), ReportError> {
    let sarif = build_sarif(findings)?;
    let markdown = sarif_to_markdown(&sarif)?;
    write_in_dir(output_dir, "report.sarif", sarif.as_bytes())
        .map_err(|error| ReportError::Sarif(error.to_string()))?;
    write_in_dir(output_dir, "report.md", markdown.as_bytes())
        .map_err(|error| ReportError::Markdown(error.to_string()))?;
    Ok(())
}

/// Mutable bookkeeping shared while processing the list of inputs.
///
/// The two flags decide the exit code (see `RunState::exit_code`); `findings`
/// is what the stdout renderer and the report writer consume.
#[derive(Default)]
struct RunState {
    /// Set once any successfully read input fails UTF-8 validation.
    any_invalid: bool,
    /// Set once an input cannot be read, a directory walk reports an error, or
    /// rendering / writing a report fails.
    any_error: bool,
    /// One entry per input that could be read, in the order it was processed.
    findings: Vec<Finding>,
}

impl RunState {
    /// Phase 1: turn every requested input into a finding (or a diagnostic).
    ///
    /// With no positional arguments standard input is validated. Otherwise each
    /// argument is handled in order: `-` means standard input, a directory is
    /// walked, and anything else is read as a single file.
    fn collect_inputs<E: Write>(
        &mut self,
        args: &Args,
        policy: &PathPolicy,
        err: &mut E,
    ) -> std::io::Result<()> {
        if args.files.is_empty() {
            return self.record("<stdin>", read_stdin(args.max_size), err);
        }

        let dash = OsStr::new("-");
        for input in &args.files {
            if input.as_os_str() == dash {
                self.record("<stdin>", read_stdin(args.max_size), err)?;
                continue;
            }
            if is_dir(input) {
                self.walk_dir(input, args, policy, err)?;
                continue;
            }
            // An explicit file (or a non-existent path, whose error surfaces
            // from `read`) is always validated — ignore rules apply only to
            // directory walking.
            self.record(&input.display().to_string(), policy.read(input), err)?;
        }
        Ok(())
    }

    /// Validate every regular file the walker yields under `dir`.
    ///
    /// The walker applies `--exclude`, `.gitignore`/`.ignore` (unless
    /// `--no-ignore`), and hidden-file skipping (unless `--hidden`). Each file
    /// is still read via [`PathPolicy`]. A walker that cannot be built, or an
    /// entry that cannot be visited, is reported on `err` and marks the run as
    /// failed without aborting it.
    fn walk_dir<E: Write>(
        &mut self,
        dir: &Path,
        args: &Args,
        policy: &PathPolicy,
        err: &mut E,
    ) -> std::io::Result<()> {
        let entries = match build_walker(dir, args) {
            Ok(entries) => entries,
            Err(error) => {
                self.any_error = true;
                return writeln!(err, "error: {}: {error}", dir.display());
            },
        };

        for step in entries {
            let entry = match step {
                Ok(entry) => entry,
                Err(error) => {
                    self.any_error = true;
                    writeln!(err, "error: walking {}: {error}", dir.display())?;
                    continue;
                },
            };
            // Directories and other non-regular entries are skipped.
            let is_regular_file = entry.file_type().is_some_and(|kind| kind.is_file());
            if !is_regular_file {
                continue;
            }
            let path = entry.path();
            self.record(&path.display().to_string(), policy.read(path), err)?;
        }
        Ok(())
    }

    /// Phase 2: write the findings to `out` in the format chosen by `args`.
    ///
    /// A rendering failure is reported on `err` and marks the run as failed;
    /// nothing is written to `out` in that case.
    fn emit_stdout<O: Write, E: Write>(
        &mut self,
        args: &Args,
        out: &mut O,
        err: &mut E,
    ) -> std::io::Result<()> {
        let rendered = match render_stdout(args.format, &self.findings) {
            Ok(rendered) => rendered,
            Err(report_error) => {
                self.any_error = true;
                return writeln!(err, "error: {report_error}");
            },
        };
        write!(out, "{rendered}")
    }

    /// Phase 3: write `report.sarif` + `report.md` into the output directory.
    ///
    /// Does nothing when there are no findings. A failure is reported on `err`
    /// and marks the run as failed.
    fn emit_reports<E: Write>(&mut self, args: &Args, err: &mut E) -> std::io::Result<()> {
        if self.findings.is_empty() {
            return Ok(());
        }
        match write_reports(&args.output_dir, &self.findings) {
            Ok(()) => Ok(()),
            Err(report_error) => {
                self.any_error = true;
                writeln!(
                    err,
                    "error: writing reports to {}: {report_error}",
                    args.output_dir.display()
                )
            },
        }
    }

    /// Fold one input into the run state.
    ///
    /// Readable bytes are validated and stored as a [`Finding`] under `label`;
    /// a read error is written to `err` and marks the run as failed.
    fn record<E: Write>(
        &mut self,
        label: &str,
        bytes: Result<Vec<u8>, PathSecurityError>,
        err: &mut E,
    ) -> std::io::Result<()> {
        let bytes = match bytes {
            Ok(bytes) => bytes,
            Err(error) => {
                self.any_error = true;
                return writeln!(err, "error: {label}: {error}");
            },
        };

        let verdict = validate(&bytes);
        self.any_invalid |= !verdict.is_valid();
        self.findings.push(Finding {
            label: label.to_owned(),
            validity: verdict,
        });
        Ok(())
    }

    /// Exit code for the run: an error outranks invalid input, which outranks
    /// success (`2`, `1`, `0` respectively).
    fn exit_code(&self) -> u8 {
        match (self.any_error, self.any_invalid) {
            (true, _) => 2,
            (false, true) => 1,
            (false, false) => 0,
        }
    }
}

#[cfg(test)]
mod tests {
    use std::io::Write as _;

    use super::*;

    /// Create a fresh temporary directory containing one file called `name`
    /// with the given contents. The caller keeps the returned `TempDir` alive
    /// for as long as it uses the path.
    fn temp_file(name: &str, bytes: &[u8]) -> (tempfile::TempDir, PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join(name);
        std::fs::File::create(&path)
            .unwrap()
            .write_all(bytes)
            .unwrap();
        (dir, path)
    }

    /// Parse `items` as command-line arguments (program name supplied here).
    fn args_from(items: &[&str]) -> Args {
        // `--no-report` keeps unit tests from writing report files into the cwd;
        // the report pipeline is covered by the integration tests.
        let mut argv = Vec::with_capacity(items.len() + 2);
        argv.push("simdutf8-cli");
        argv.push("--no-report");
        argv.extend_from_slice(items);
        Args::try_parse_from(argv).expect("args should parse")
    }

    /// Run the CLI against in-memory sinks and hand back
    /// `(exit code, stdout bytes, stderr bytes)`.
    fn run_captured(items: &[&str]) -> (u8, Vec<u8>, Vec<u8>) {
        let args = args_from(items);
        let (mut out, mut err) = (Vec::new(), Vec::new());
        let code = run(&args, &mut out, &mut err).unwrap();
        (code, out, err)
    }

    #[test]
    fn reports_valid_file_with_exit_zero() {
        let (_dir, path) = temp_file("ok.txt", "héllo".as_bytes());
        let (code, out, _err) = run_captured(&[path.to_str().unwrap()]);
        assert_eq!(code, 0);
        assert!(String::from_utf8_lossy(&out).contains("OK"));
    }

    #[test]
    fn reports_invalid_file_with_exit_one() {
        let (_dir, path) = temp_file("bad.bin", b"a\xFFb");
        let (code, out, _err) = run_captured(&[path.to_str().unwrap()]);
        assert_eq!(code, 1);
        assert!(String::from_utf8_lossy(&out).contains("FAIL"));
    }

    #[test]
    fn json_format_emits_an_array() {
        let (_dir, path) = temp_file("ok.txt", b"hi");
        let (code, out, _err) = run_captured(&["--format", "json", path.to_str().unwrap()]);
        assert_eq!(code, 0);
        let s = String::from_utf8(out).unwrap();
        assert!(s.trim_start().starts_with('['), "got: {s}");
        assert!(s.contains(r#""valid":true"#), "got: {s}");
    }

    #[test]
    fn quiet_suppresses_stdout() {
        let (_dir, path) = temp_file("ok.txt", b"hi");
        let (code, out, _err) = run_captured(&["-q", path.to_str().unwrap()]);
        assert_eq!(code, 0);
        assert!(out.is_empty(), "expected no output, got: {out:?}");
    }

    #[test]
    fn missing_file_yields_exit_two() {
        let dir = tempfile::tempdir().unwrap();
        let missing = dir.path().join("does-not-exist");
        let (code, _out, err) = run_captured(&[missing.to_str().unwrap()]);
        assert_eq!(code, 2);
        assert!(String::from_utf8_lossy(&err).contains("error"));
    }

    #[test]
    fn base_dir_blocks_files_outside_it() {
        let base = tempfile::tempdir().unwrap();
        let (_outside_dir, outside) = temp_file("secret.txt", b"data");
        let (code, _out, _err) = run_captured(&[
            "--base-dir",
            base.path().to_str().unwrap(),
            outside.to_str().unwrap(),
        ]);
        assert_eq!(code, 2);
    }
}