simdutf8_cli/
cli.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: 2025,2026 ndaal Gesellschaft für Sicherheit in der Informationstechnik mbH & Co KG, Cologne
3// SPDX-FileCopyrightText: Author: Pierre Gronau <Pierre.Gronau@ndaal.eu>
4
5//! Command-line interface: argument definitions and the [`run`] entry point.
6//!
7//! [`run`] is generic over its output and error sinks so it can be driven both
8//! by the real binary (locked stdout/stderr) and by tests (in-memory buffers).
9//! It returns the process exit code:
10//!
//! | code | meaning                                                                   |
//! |------|---------------------------------------------------------------------------|
//! | `0`  | every input was valid UTF-8                                               |
//! | `1`  | every input was readable, but at least one was invalid                    |
//! | `2`  | an input could not be read securely, or a directory walk or report failed |
16
17use std::ffi::OsStr;
18use std::io::Write;
19use std::path::{Path, PathBuf};
20
21use clap::Parser;
22use ignore::overrides::OverrideBuilder;
23use ignore::{Walk, WalkBuilder};
24
25use crate::path_security::{
26    read_capped, write_in_dir, PathPolicy, PathSecurityError, DEFAULT_MAX_FILE_SIZE,
27};
28use crate::report::{
29    build_sarif, json_block, sarif_to_markdown, text_block, Finding, OutputFormat, ReportError,
30};
31use crate::validate::validate;
32
33/// Validate whether files (or standard input) contain well-formed UTF-8, fast.
34#[derive(Debug, Parser)]
35#[command(name = "simdutf8-cli", version, about, long_about = None)]
36pub struct Args {
37    /// Files or directories to validate. Directories are walked recursively.
38    /// Use `-` or pass none to read standard input.
39    #[arg(value_name = "PATH")]
40    pub files: Vec<PathBuf>,
41
42    /// Exclude paths matching this glob when walking directories (repeatable,
43    /// gitignore syntax, e.g. `--exclude '*.min.js' --exclude target`).
44    #[arg(long, value_name = "GLOB")]
45    pub exclude: Vec<String>,
46
47    /// When walking directories, do not respect `.gitignore` / `.ignore` files
48    /// (they are respected by default).
49    #[arg(long)]
50    pub no_ignore: bool,
51
52    /// When walking directories, also descend into hidden files and directories
53    /// (skipped by default).
54    #[arg(long)]
55    pub hidden: bool,
56
57    /// Confine inputs to this base directory (rejects traversal & symlink escapes).
58    #[arg(long, value_name = "DIR")]
59    pub base_dir: Option<PathBuf>,
60
61    /// Maximum number of bytes to read from each input.
62    #[arg(long, value_name = "BYTES", default_value_t = DEFAULT_MAX_FILE_SIZE)]
63    pub max_size: u64,
64
65    /// Reject symbolic links instead of resolving them.
66    #[arg(long)]
67    pub no_follow_symlinks: bool,
68
69    /// Output format for stdout.
70    #[arg(long, value_enum, default_value_t = OutputFormat::Text)]
71    pub format: OutputFormat,
72
73    /// Suppress per-input output and rely on the exit code only.
74    #[arg(short, long)]
75    pub quiet: bool,
76
77    /// Directory for the auto-generated `report.sarif` / `report.md` files.
78    #[arg(long, value_name = "DIR", default_value = ".")]
79    pub output_dir: PathBuf,
80
81    /// Do not auto-generate the `report.sarif` / `report.md` files.
82    #[arg(long)]
83    pub no_report: bool,
84}
85
86/// Run the validation described by `args`, writing results to `out` and
87/// diagnostics to `err`. Returns the process exit code.
88///
89/// # Errors
90///
91/// Returns an error only if writing to `out` itself fails; per-input failures
92/// are reported via `err` and reflected in the returned exit code.
93pub fn run<O: Write, E: Write>(args: &Args, out: &mut O, err: &mut E) -> std::io::Result<u8> {
94    let policy = build_policy(args);
95    let mut state = RunState::default();
96
97    state.collect_inputs(args, &policy, err)?; // read + validate every input
98    if !args.quiet {
99        state.emit_stdout(args, out, err)?; // render the chosen format
100    }
101    if !args.no_report {
102        state.emit_reports(args, err)?; // auto-write report.sarif / report.md
103    }
104
105    Ok(state.exit_code())
106}
107
108/// Build the [`PathPolicy`] implied by the command-line arguments.
109fn build_policy(args: &Args) -> PathPolicy {
110    let mut policy = PathPolicy::new()
111        .max_file_size(args.max_size)
112        .allow_symlinks(!args.no_follow_symlinks);
113    if let Some(base) = &args.base_dir {
114        policy = policy.base_dir(base.clone());
115    }
116    policy
117}
118
/// Report whether `path` currently names a directory (used to decide between
/// walking and reading). Any path whose metadata cannot be queried counts as a
/// non-directory, so the underlying error is reported later by `read`.
fn is_dir(path: &Path) -> bool {
    // `Path::is_dir` queries the same metadata and maps any error to `false`.
    path.is_dir()
}
124
125/// Build a directory walker honouring `--exclude`, ignore-file rules, and the
126/// hidden-file policy. `.gitignore` is respected even outside a git repository
127/// (`require_git(false)`); symlinked directories are never followed.
128fn build_walker(dir: &Path, args: &Args) -> Result<Walk, ignore::Error> {
129    let mut overrides = OverrideBuilder::new(dir);
130    for pattern in &args.exclude {
131        // A leading `!` makes an override glob an *exclude* (ripgrep semantics).
132        overrides.add(&format!("!{pattern}"))?;
133    }
134    let respect_ignores = !args.no_ignore;
135
136    let mut builder = WalkBuilder::new(dir);
137    builder
138        .overrides(overrides.build()?)
139        .hidden(!args.hidden)
140        .git_ignore(respect_ignores)
141        .git_global(respect_ignores)
142        .git_exclude(respect_ignores)
143        .ignore(respect_ignores)
144        .parents(respect_ignores)
145        .require_git(false)
146        .follow_links(false);
147    Ok(builder.build())
148}
149
150/// Read standard input, hard-capped at `limit` bytes.
151fn read_stdin(limit: u64) -> Result<Vec<u8>, PathSecurityError> {
152    let stdin = std::io::stdin();
153    read_capped(stdin.lock(), limit)
154}
155
156/// Render the findings to a string in the requested stdout format. Each format
157/// ends with a trailing newline.
158fn render_stdout(
159    format: OutputFormat,
160    findings: &[Finding],
161) -> std::result::Result<String, ReportError> {
162    match format {
163        OutputFormat::Text => Ok(text_block(findings)),
164        OutputFormat::Json => Ok(json_block(findings)),
165        OutputFormat::Sarif => {
166            let mut sarif = build_sarif(findings)?;
167            sarif.push('\n');
168            Ok(sarif)
169        },
170        OutputFormat::Markdown => {
171            let sarif = build_sarif(findings)?;
172            let mut markdown = sarif_to_markdown(&sarif)?;
173            if !markdown.ends_with('\n') {
174                markdown.push('\n');
175            }
176            Ok(markdown)
177        },
178    }
179}
180
181/// Build, strict-validate, and write `report.sarif` + `report.md` into
182/// `output_dir` via a capability-scoped directory handle.
183fn write_reports(output_dir: &Path, findings: &[Finding]) -> std::result::Result<(), ReportError> {
184    let sarif = build_sarif(findings)?;
185    let markdown = sarif_to_markdown(&sarif)?;
186    write_in_dir(output_dir, "report.sarif", sarif.as_bytes())
187        .map_err(|error| ReportError::Sarif(error.to_string()))?;
188    write_in_dir(output_dir, "report.md", markdown.as_bytes())
189        .map_err(|error| ReportError::Markdown(error.to_string()))?;
190    Ok(())
191}
192
193/// Mutable bookkeeping shared while processing the list of inputs.
194#[derive(Default)]
195struct RunState {
196    any_invalid: bool,
197    any_error: bool,
198    findings: Vec<Finding>,
199}
200
201impl RunState {
202    /// Phase 1: read + validate every input (files, `-`, or stdin) into findings.
203    fn collect_inputs<E: Write>(
204        &mut self,
205        args: &Args,
206        policy: &PathPolicy,
207        err: &mut E,
208    ) -> std::io::Result<()> {
209        if args.files.is_empty() {
210            return self.record("<stdin>", read_stdin(args.max_size), err);
211        }
212        for file in &args.files {
213            if file.as_os_str() == OsStr::new("-") {
214                self.record("<stdin>", read_stdin(args.max_size), err)?;
215            } else if is_dir(file) {
216                self.walk_dir(file, args, policy, err)?;
217            } else {
218                // An explicit file (or a non-existent path, whose error surfaces
219                // from `read`) is always validated — ignore rules apply only to
220                // directory walking.
221                let label = file.display().to_string();
222                self.record(&label, policy.read(file), err)?;
223            }
224        }
225        Ok(())
226    }
227
228    /// Recursively validate every regular file under `dir`, honouring
229    /// `--exclude`, `.gitignore`/`.ignore` (unless `--no-ignore`), and hidden-file
230    /// skipping (unless `--hidden`). Each file is still read via [`PathPolicy`].
231    fn walk_dir<E: Write>(
232        &mut self,
233        dir: &Path,
234        args: &Args,
235        policy: &PathPolicy,
236        err: &mut E,
237    ) -> std::io::Result<()> {
238        let walker = match build_walker(dir, args) {
239            Ok(walker) => walker,
240            Err(error) => {
241                self.any_error = true;
242                writeln!(err, "error: {}: {error}", dir.display())?;
243                return Ok(());
244            },
245        };
246        for entry in walker {
247            match entry {
248                Ok(entry) if entry.file_type().is_some_and(|ft| ft.is_file()) => {
249                    let path = entry.path();
250                    let label = path.display().to_string();
251                    self.record(&label, policy.read(path), err)?;
252                },
253                Ok(_) => {}, // a directory or non-regular entry — skip
254                Err(error) => {
255                    self.any_error = true;
256                    writeln!(err, "error: walking {}: {error}", dir.display())?;
257                },
258            }
259        }
260        Ok(())
261    }
262
263    /// Phase 2: render the findings in the selected format to `out`.
264    fn emit_stdout<O: Write, E: Write>(
265        &mut self,
266        args: &Args,
267        out: &mut O,
268        err: &mut E,
269    ) -> std::io::Result<()> {
270        match render_stdout(args.format, &self.findings) {
271            Ok(rendered) => write!(out, "{rendered}"),
272            Err(report_error) => {
273                self.any_error = true;
274                writeln!(err, "error: {report_error}")
275            },
276        }
277    }
278
279    /// Phase 3: auto-write `report.sarif` + `report.md` unless there is nothing
280    /// to report.
281    fn emit_reports<E: Write>(&mut self, args: &Args, err: &mut E) -> std::io::Result<()> {
282        if self.findings.is_empty() {
283            return Ok(());
284        }
285        if let Err(report_error) = write_reports(&args.output_dir, &self.findings) {
286            self.any_error = true;
287            writeln!(
288                err,
289                "error: writing reports to {}: {report_error}",
290                args.output_dir.display()
291            )?;
292        }
293        Ok(())
294    }
295
296    /// Validate one input's bytes (or report its read error) and accumulate the
297    /// resulting [`Finding`].
298    fn record<E: Write>(
299        &mut self,
300        label: &str,
301        bytes: Result<Vec<u8>, PathSecurityError>,
302        err: &mut E,
303    ) -> std::io::Result<()> {
304        match bytes {
305            Ok(bytes) => {
306                let verdict = validate(&bytes);
307                if !verdict.is_valid() {
308                    self.any_invalid = true;
309                }
310                self.findings.push(Finding {
311                    label: label.to_owned(),
312                    validity: verdict,
313                });
314            },
315            Err(error) => {
316                self.any_error = true;
317                writeln!(err, "error: {label}: {error}")?;
318            },
319        }
320        Ok(())
321    }
322
323    /// Final exit code: `2` on any read/report error, else `1` on any invalid
324    /// input, else `0`.
325    fn exit_code(&self) -> u8 {
326        if self.any_error {
327            2
328        } else {
329            u8::from(self.any_invalid)
330        }
331    }
332}
333
#[cfg(test)]
mod tests {
    use std::io::Write as _;

    use super::*;

    /// Create a fresh temporary directory holding one file `name` with `bytes`.
    /// The directory guard is returned so the file outlives the call.
    fn temp_file(name: &str, bytes: &[u8]) -> (tempfile::TempDir, PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join(name);
        std::fs::File::create(&path)
            .unwrap()
            .write_all(bytes)
            .unwrap();
        (dir, path)
    }

    /// Parse `items` as command-line arguments.
    fn args_from(items: &[&str]) -> Args {
        // `--no-report` keeps unit tests from writing report files into the cwd;
        // the report pipeline is covered by the integration tests.
        let argv = ["simdutf8-cli", "--no-report"]
            .into_iter()
            .chain(items.iter().copied());
        Args::try_parse_from(argv).expect("args should parse")
    }

    /// Run the CLI with `items` and capture the exit code plus the raw bytes
    /// written to stdout and stderr.
    fn run_captured(items: &[&str]) -> (u8, Vec<u8>, Vec<u8>) {
        let args = args_from(items);
        let mut out = Vec::new();
        let mut err = Vec::new();
        let code = run(&args, &mut out, &mut err).unwrap();
        (code, out, err)
    }

    #[test]
    fn reports_valid_file_with_exit_zero() {
        let (_dir, path) = temp_file("ok.txt", "héllo".as_bytes());
        let (code, out, _err) = run_captured(&[path.to_str().unwrap()]);
        assert_eq!(code, 0);
        assert!(String::from_utf8_lossy(&out).contains("OK"));
    }

    #[test]
    fn reports_invalid_file_with_exit_one() {
        let (_dir, path) = temp_file("bad.bin", b"a\xFFb");
        let (code, out, _err) = run_captured(&[path.to_str().unwrap()]);
        assert_eq!(code, 1);
        assert!(String::from_utf8_lossy(&out).contains("FAIL"));
    }

    #[test]
    fn json_format_emits_an_array() {
        let (_dir, path) = temp_file("ok.txt", b"hi");
        let (code, out, _err) = run_captured(&["--format", "json", path.to_str().unwrap()]);
        assert_eq!(code, 0);
        let s = String::from_utf8(out).unwrap();
        assert!(s.trim_start().starts_with('['), "got: {s}");
        assert!(s.contains(r#""valid":true"#), "got: {s}");
    }

    #[test]
    fn quiet_suppresses_stdout() {
        let (_dir, path) = temp_file("ok.txt", b"hi");
        let (code, out, _err) = run_captured(&["-q", path.to_str().unwrap()]);
        assert_eq!(code, 0);
        assert!(out.is_empty(), "expected no output, got: {out:?}");
    }

    #[test]
    fn missing_file_yields_exit_two() {
        let dir = tempfile::tempdir().unwrap();
        let missing = dir.path().join("does-not-exist");
        let (code, _out, err) = run_captured(&[missing.to_str().unwrap()]);
        assert_eq!(code, 2);
        assert!(String::from_utf8_lossy(&err).contains("error"));
    }

    #[test]
    fn base_dir_blocks_files_outside_it() {
        let base = tempfile::tempdir().unwrap();
        let (_outside_dir, outside) = temp_file("secret.txt", b"data");
        let (code, _out, _err) = run_captured(&[
            "--base-dir",
            base.path().to_str().unwrap(),
            outside.to_str().unwrap(),
        ]);
        assert_eq!(code, 2);
    }
}
simdutf8_cli/cli.rs

simdutf8_cli/
cli.rs