// simdutf8-cli 0.1.6

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: 2025,2026 ndaal Gesellschaft für Sicherheit in der Informationstechnik mbH & Co KG, Cologne
// SPDX-FileCopyrightText: Author: Pierre Gronau <Pierre.Gronau@ndaal.eu>

//! Rendering of [`Validity`] verdicts as text, JSON, SARIF 2.1.0, or Markdown.
//!
//! Plain text and JSON are emitted without a serialization framework: a single,
//! audited [`json_escape`] routine handles string escaping so the output is
//! always valid JSON regardless of the bytes present in a file path.
//!
//! SARIF and Markdown follow `skills/rust-sarif.md`: SARIF 2.1.0 is produced and
//! strict-validated with [`sarif_rust`], and Markdown is derived from that SARIF
//! via [`sarif_to_md_core`]. This keeps the structured output spec-compliant for
//! CI ingestion, code review, and compliance tooling.

use sarif_rust::parser::SarifValidator;
use sarif_rust::{Level, ResultBuilder, RunBuilder, SarifLogBuilder, ToolBuilder};
use sarif_to_md_core::markdown::sarif::generator::SarifMarkdownGenerator;
use sarif_to_md_core::markdown::MarkdownFormat;
use sarif_to_md_core::ReportProcessorBuilder;

use crate::validate::Validity;

/// SARIF rule id declared on the tool and attached to every result: invalid
/// inputs carry it at `error` level, valid inputs at `none` level.
const RULE_ID: &str = "invalid-utf8";
/// Tool name embedded in SARIF reports (and so in the Markdown derived from them).
const TOOL_NAME: &str = "simdutf8-cli";

/// Output format selected on the command line.
///
/// [`OutputFormat::Text`] is the [`Default`] value.
// NOTE(review): the `clap::ValueEnum` derive presumably surfaces the variant doc
// comments below as `--help` text, so they are user-facing — reword with care.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, clap::ValueEnum)]
pub enum OutputFormat {
    /// One human-readable line per input.
    #[default]
    Text,
    /// A JSON array with one object per input.
    Json,
    /// SARIF 2.1.0 JSON (strict-validated), for CI ingestion.
    Sarif,
    /// GitHub-Flavored Markdown derived from the SARIF report.
    Markdown,
}

/// A single input's validation outcome, carried through to whichever output
/// format is rendered at the end of a run.
///
/// The same slice of findings feeds every renderer in this module:
/// [`text_block`], [`json_block`], and [`build_sarif`].
#[derive(Clone, Debug)]
pub struct Finding {
    /// Display label for the input (a file path, or `<stdin>`).
    ///
    /// Rendered verbatim in text lines and SARIF messages, JSON-escaped in JSON
    /// records, and percent-encoded when used as the SARIF artifact URI.
    pub label: String,
    /// The validation verdict for the input.
    pub validity: Validity,
}

/// Errors from building or converting structured (SARIF / Markdown) reports.
///
/// Both variants carry a plain `String`: either the `to_string()` rendering of
/// the underlying library error, or a fixed message produced by this module's
/// own Markdown validation.
#[derive(Debug, thiserror::Error)]
pub enum ReportError {
    /// SARIF generation or strict validation failed.
    ///
    /// Returned by [`build_sarif`].
    #[error("SARIF generation failed: {0}")]
    Sarif(String),
    /// SARIF-to-Markdown conversion or validation failed.
    ///
    /// Returned by [`sarif_to_markdown`], including when the generated Markdown
    /// is empty or has no recognisable structure.
    #[error("Markdown generation failed: {0}")]
    Markdown(String),
}

/// Produce the body of a JSON string literal for `input`.
///
/// The quotation mark and the backslash are backslash-escaped, the control
/// characters with a JSON shorthand (`\b`, `\t`, `\n`, `\f`, `\r`) use it, and
/// every other character below `U+0020` becomes a lowercase `\u00xx` escape.
/// All remaining characters are copied through unchanged. The surrounding
/// quotes are *not* added; the caller supplies them.
#[must_use]
pub fn json_escape(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        // Characters with a dedicated two-character escape sequence.
        let shorthand = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\u{0008}' => Some("\\b"),
            '\t' => Some("\\t"),
            '\n' => Some("\\n"),
            '\u{000C}' => Some("\\f"),
            '\r' => Some("\\r"),
            _ => None,
        };
        if let Some(sequence) = shorthand {
            escaped.push_str(sequence);
        } else if ch < '\u{0020}' {
            // Remaining C0 controls: the code point fits in one byte, so the
            // fallback of the conversion is never taken in practice.
            let code = u8::try_from(u32::from(ch)).unwrap_or(0);
            escaped.push_str("\\u00");
            escaped.push(hex_nibble(code >> 4));
            escaped.push(hex_nibble(code & 0x0F));
        } else {
            escaped.push(ch);
        }
    }
    escaped
}

/// Convert a 4-bit value (`0..=15`) into its lowercase hexadecimal digit.
///
/// Uses plain arithmetic on the ASCII code rather than a lookup table, so no
/// indexing is involved. Callers are expected to pass only values in `0..=15`.
const fn hex_nibble(nibble: u8) -> char {
    if nibble < 10 {
        (b'0' + nibble) as char
    } else {
        (b'a' + nibble - 10) as char
    }
}

/// Turn a file path into a percent-encoded URI reference suitable for a SARIF
/// `artifactLocation.uri`.
///
/// ASCII letters, digits, `-`, `.`, `_`, `~` and the `/` separator are copied
/// as-is. Every other byte of the UTF-8 encoding (spaces, `:`, `?`, non-ASCII
/// bytes, ...) is written as `%XX` with uppercase hex digits. An empty path
/// yields `.` so the result is never an empty string. The human-readable name
/// is kept separately in the result message.
fn path_to_uri(path: &str) -> String {
    // Every input byte produces at least one output character, so only an
    // empty path can produce an empty URI.
    if path.is_empty() {
        return ".".to_owned();
    }

    let mut uri = String::with_capacity(path.len());
    for byte in path.bytes() {
        let keep_verbatim =
            byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'.' | b'_' | b'~' | b'/');
        if keep_verbatim {
            uri.push(char::from(byte));
        } else {
            let high = hex_nibble(byte >> 4).to_ascii_uppercase();
            let low = hex_nibble(byte & 0x0F).to_ascii_uppercase();
            uri.push('%');
            uri.push(high);
            uri.push(low);
        }
    }
    uri
}

/// Describe an invalid sequence for humans.
///
/// `None` means the input ended in the middle of a sequence; `Some(n)` reports
/// `n` invalid bytes, using the singular form for exactly one byte.
fn invalid_detail(error_len: Option<usize>) -> String {
    error_len.map_or_else(
        || "incomplete sequence".to_owned(),
        |count| {
            if count == 1 {
                "1 invalid byte".to_owned()
            } else {
                format!("{count} invalid bytes")
            }
        },
    )
}

/// Format one verdict as a single human-readable line, without a trailing
/// newline.
///
/// Valid inputs produce an `OK` line with the label. Invalid inputs produce a
/// `FAIL` line that also names the byte offset up to which the input was valid
/// and a short description of the offending sequence.
#[must_use]
pub fn text_line(label: &str, validity: &Validity) -> String {
    match *validity {
        Validity::Invalid {
            valid_up_to,
            error_len,
        } => format!(
            "FAIL  {label}: invalid UTF-8 at byte {valid_up_to} ({})",
            invalid_detail(error_len)
        ),
        Validity::Valid => format!("OK    {label}"),
    }
}

/// Format one verdict as a compact JSON object, without a trailing newline.
///
/// The object always has `path` (the JSON-escaped label) and `valid`. Invalid
/// inputs additionally carry `valid_up_to` and `error_len`, where `error_len`
/// is `null` when no length is available.
#[must_use]
pub fn json_record(label: &str, validity: &Validity) -> String {
    let escaped = json_escape(label);
    match *validity {
        Validity::Invalid {
            valid_up_to,
            error_len,
        } => {
            // A missing length is serialized as the JSON literal `null`.
            let length = error_len
                .as_ref()
                .map_or_else(|| String::from("null"), ToString::to_string);
            format!(
                r#"{{"path":"{escaped}","valid":false,"valid_up_to":{valid_up_to},"error_len":{length}}}"#
            )
        },
        Validity::Valid => format!(r#"{{"path":"{escaped}","valid":true}}"#),
    }
}

/// Format all findings as human-readable text: one [`text_line`] per finding,
/// each terminated by a newline. An empty slice yields an empty string.
#[must_use]
pub fn text_block(findings: &[Finding]) -> String {
    findings
        .iter()
        .map(|entry| {
            let mut line = text_line(&entry.label, &entry.validity);
            line.push('\n');
            line
        })
        .collect()
}

/// Format all findings as one compact JSON array followed by a newline.
///
/// Each element is the object produced by [`json_record`]; elements are
/// separated by a bare comma. An empty slice yields `[]`.
#[must_use]
pub fn json_block(findings: &[Finding]) -> String {
    let mut body = String::new();
    for (index, entry) in findings.iter().enumerate() {
        // Separator goes between records, never before the first one.
        if index != 0 {
            body.push(',');
        }
        body.push_str(&json_record(&entry.label, &entry.validity));
    }
    format!("[{body}]\n")
}

/// Assemble the findings into a SARIF 2.1.0 document, strict-validate it, and
/// return it as pretty-printed JSON.
///
/// The tool section declares the single `invalid-utf8` rule. Every finding
/// becomes one result under that rule: `error` level for invalid inputs and
/// `none` level for valid ones. The result message carries the readable label,
/// while the location uses the percent-encoded form of the label.
///
/// # Errors
///
/// Returns [`ReportError::Sarif`] if the log cannot be built, if strict SARIF
/// validation rejects it, or if serialization fails.
pub fn build_sarif(findings: &[Finding]) -> std::result::Result<String, ReportError> {
    let tool = ToolBuilder::new(TOOL_NAME)
        .with_version(env!("CARGO_PKG_VERSION"))
        .add_simple_rule(RULE_ID, "Invalid UTF-8")
        .build();

    // Thread the run builder through every finding, adding one result each.
    let run = findings
        .iter()
        .fold(RunBuilder::new(tool), |run, finding| {
            let label = &finding.label;
            let (level, message) = match finding.validity {
                Validity::Invalid {
                    valid_up_to,
                    error_len,
                } => {
                    let detail = invalid_detail(error_len);
                    (
                        Level::Error,
                        format!("{label}: invalid UTF-8 at byte {valid_up_to} ({detail})"),
                    )
                },
                Validity::Valid => (Level::None, format!("{label}: valid UTF-8")),
            };
            // The location must be a valid URI reference, so the label is
            // percent-encoded here; the readable name lives in the message.
            // NOTE(review): the trailing `1, 1` is presumably line/column —
            // confirm against the `sarif_rust` builder documentation.
            let location_uri = path_to_uri(label);
            run.add_result(
                ResultBuilder::with_text_message(message)
                    .with_rule_id(RULE_ID)
                    .with_level(level)
                    .add_file_location(location_uri, 1, 1)
                    .build(),
            )
        });

    let log = SarifLogBuilder::with_standard_schema()
        .add_run(run.build())
        .build()
        .map_err(|err| ReportError::Sarif(err.to_string()))?;

    // Refuse to emit anything the strict validator does not accept.
    if let Err(err) = SarifValidator::strict().validate_sarif_log(&log) {
        return Err(ReportError::Sarif(err.to_string()));
    }

    sarif_rust::to_string_pretty(&log).map_err(|err| ReportError::Sarif(err.to_string()))
}

/// Render a SARIF JSON document as GitHub-Flavored Markdown.
///
/// The generated Markdown is checked with [`validate_markdown`] before it is
/// returned, so callers never receive an empty or structureless report.
///
/// # Errors
///
/// Returns [`ReportError::Markdown`] if the processor cannot be built, if
/// generation fails, or if the output is empty or structureless.
pub fn sarif_to_markdown(sarif_json: &str) -> std::result::Result<String, ReportError> {
    let markdown = ReportProcessorBuilder::new()
        .generator(SarifMarkdownGenerator::new(
            MarkdownFormat::GitHubFlavored,
            true,
        ))
        .content(sarif_json.to_owned())
        .build()
        .map_err(|err| ReportError::Markdown(err.to_string()))?
        .generate()
        .map_err(|err| ReportError::Markdown(err.to_string()))?;

    validate_markdown(&markdown).map(|()| markdown)
}

/// Sanity-check generated Markdown.
///
/// The text must contain something other than whitespace, and it must contain
/// at least one of `#`, `|`, or `---` — a cheap signal that a heading, table,
/// or thematic break is present.
///
/// # Errors
///
/// Returns [`ReportError::Markdown`] if `markdown` is blank or shows none of
/// those structural markers.
fn validate_markdown(markdown: &str) -> std::result::Result<(), ReportError> {
    if markdown.trim().is_empty() {
        return Err(ReportError::Markdown("output is empty".to_owned()));
    }

    let has_structure =
        markdown.contains('#') || markdown.contains('|') || markdown.contains("---");
    if has_structure {
        Ok(())
    } else {
        Err(ReportError::Markdown(
            "output missing expected structure (no headings, tables, or rules)".to_owned(),
        ))
    }
}

/// Unit tests for the renderers in this module.
///
/// The SARIF / Markdown tests exercise the real `sarif_rust` and
/// `sarif_to_md_core` pipelines; everything else is pure string checking.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn json_escape_passes_through_plain_text() {
        assert_eq!(json_escape("plain"), "plain");
    }

    #[test]
    fn json_escape_handles_quotes_and_backslashes() {
        assert_eq!(json_escape(r#"a"b\c"#), r#"a\"b\\c"#);
    }

    #[test]
    fn json_escape_handles_control_characters() {
        assert_eq!(json_escape("line\nbreak\ttab"), "line\\nbreak\\ttab");
        // A bare control char with no shorthand becomes a \u escape.
        assert_eq!(json_escape("\u{0001}"), "\\u0001");
    }

    #[test]
    fn json_escape_handles_remaining_shorthands_and_c0_boundary() {
        // The shorthands not covered above: backspace, form feed, carriage return.
        assert_eq!(json_escape("\u{0008}\u{000C}\r"), "\\b\\f\\r");
        // Last C0 control char is escaped with lowercase hex digits...
        assert_eq!(json_escape("\u{001F}"), "\\u001f");
        // ...while the first character past the C0 range passes through.
        assert_eq!(json_escape(" "), " ");
        // Non-ASCII text is left as-is.
        assert_eq!(json_escape("café"), "café");
    }

    #[test]
    fn text_line_marks_valid_inputs() {
        let line = text_line("file.txt", &Validity::Valid);
        assert!(line.contains("OK"), "got: {line}");
        assert!(line.contains("file.txt"), "got: {line}");
    }

    #[test]
    fn text_line_marks_invalid_inputs_with_location() {
        let v = Validity::Invalid {
            valid_up_to: 3,
            error_len: Some(1),
        };
        let line = text_line("bad.bin", &v);
        assert!(line.contains("FAIL"), "got: {line}");
        assert!(line.contains("bad.bin"), "got: {line}");
        assert!(line.contains('3'), "got: {line}");
        // Pin the exact wording so the offset and detail cannot silently drift.
        assert_eq!(
            line,
            "FAIL  bad.bin: invalid UTF-8 at byte 3 (1 invalid byte)"
        );
    }

    #[test]
    fn text_line_reports_incomplete_and_multi_byte_errors() {
        let incomplete = Validity::Invalid {
            valid_up_to: 0,
            error_len: None,
        };
        assert_eq!(
            text_line("cut.bin", &incomplete),
            "FAIL  cut.bin: invalid UTF-8 at byte 0 (incomplete sequence)"
        );
        let two = Validity::Invalid {
            valid_up_to: 5,
            error_len: Some(2),
        };
        assert_eq!(
            text_line("bad.bin", &two),
            "FAIL  bad.bin: invalid UTF-8 at byte 5 (2 invalid bytes)"
        );
    }

    #[test]
    fn json_record_for_valid_input() {
        let rec = json_record("file.txt", &Validity::Valid);
        assert!(rec.contains(r#""valid":true"#), "got: {rec}");
        assert!(rec.contains(r#""path":"file.txt""#), "got: {rec}");
    }

    #[test]
    fn json_record_for_invalid_input() {
        let v = Validity::Invalid {
            valid_up_to: 3,
            error_len: None,
        };
        let rec = json_record("bad.bin", &v);
        assert!(rec.contains(r#""valid":false"#), "got: {rec}");
        assert!(rec.contains(r#""valid_up_to":3"#), "got: {rec}");
        assert!(rec.contains(r#""error_len":null"#), "got: {rec}");
    }

    #[test]
    fn json_record_escapes_the_path() {
        let rec = json_record(r#"a"b"#, &Validity::Valid);
        assert!(rec.contains(r#""path":"a\"b""#), "got: {rec}");
    }

    /// One valid and one invalid finding, shared by the SARIF / Markdown tests.
    fn sample_findings() -> Vec<Finding> {
        vec![
            Finding {
                label: "ok.txt".to_owned(),
                validity: Validity::Valid,
            },
            Finding {
                label: "bad.bin".to_owned(),
                validity: Validity::Invalid {
                    valid_up_to: 3,
                    error_len: Some(1),
                },
            },
        ]
    }

    #[test]
    fn build_sarif_is_valid_and_parses() {
        let json = build_sarif(&sample_findings()).expect("sarif builds & validates");
        assert!(json.contains("2.1.0"), "expected schema version: {json}");
        assert!(json.contains(RULE_ID), "expected rule id: {json}");
        // Round-trips through the parser (independent of our builder).
        let parsed = sarif_rust::from_str(&json);
        assert!(parsed.is_ok(), "SARIF should re-parse: {parsed:?}");
    }

    #[test]
    fn markdown_from_sarif_has_structure() {
        let json = build_sarif(&sample_findings()).expect("sarif builds & validates");
        let md = sarif_to_markdown(&json).expect("markdown generates");
        assert!(
            md.contains('#') || md.contains('|') || md.contains("---"),
            "markdown should be structured: {md}"
        );
    }

    #[test]
    fn validate_markdown_rejects_empty() {
        assert!(validate_markdown("   \n  ").is_err());
        assert!(validate_markdown("").is_err());
    }

    #[test]
    fn validate_markdown_rejects_structureless_text() {
        // Non-empty, but no heading, table, or thematic break marker.
        assert!(validate_markdown("plain prose without any structure").is_err());
    }

    #[test]
    fn validate_markdown_accepts_each_structure_marker() {
        assert!(validate_markdown("# Heading\n").is_ok());
        assert!(validate_markdown("| a | b |").is_ok());
        assert!(validate_markdown("text\n---\n").is_ok());
    }

    #[test]
    fn build_sarif_handles_paths_with_spaces_and_unicode() {
        // Strict SARIF URI validation rejects raw spaces / non-ASCII, so these
        // must be percent-encoded into the artifact location.
        let findings = vec![
            Finding {
                label: "my café/файл .txt".to_owned(),
                validity: Validity::Invalid {
                    valid_up_to: 0,
                    error_len: Some(1),
                },
            },
            Finding {
                label: "C:/weird?name*.bin".to_owned(),
                validity: Validity::Valid,
            },
        ];
        let json = build_sarif(&findings).expect("tricky paths must still validate");
        assert!(sarif_rust::from_str(&json).is_ok());
        // The readable name is preserved in the message even though the uri is
        // percent-encoded.
        assert!(json.contains("my café/файл .txt"), "message lost the label");
    }

    #[test]
    fn path_to_uri_encodes_specials_keeps_separators() {
        assert_eq!(path_to_uri("a/b.txt"), "a/b.txt");
        assert_eq!(path_to_uri("a b"), "a%20b");
        assert_eq!(path_to_uri("café"), "caf%C3%A9");
        assert_eq!(path_to_uri(""), ".");
    }

    #[test]
    fn path_to_uri_encodes_reserved_characters_uppercase() {
        // `:`, `?` and `*` are outside the unreserved set and must be encoded
        // with uppercase hex digits; `/` stays a separator.
        assert_eq!(
            path_to_uri("C:/weird?name*.bin"),
            "C%3A/weird%3Fname%2A.bin"
        );
        assert_eq!(path_to_uri("<stdin>"), "%3Cstdin%3E");
        // The unreserved punctuation passes through untouched.
        assert_eq!(path_to_uri("A-Z_0~9"), "A-Z_0~9");
    }
}