Skip to main content

simdutf8_cli/
report.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: 2025,2026 ndaal Gesellschaft für Sicherheit in der Informationstechnik mbH & Co KG, Cologne
3// SPDX-FileCopyrightText: Author: Pierre Gronau <Pierre.Gronau@ndaal.eu>
4
5//! Rendering of [`Validity`] verdicts as text, JSON, SARIF 2.1.0, or Markdown.
6//!
7//! Plain text and JSON are emitted without a serialization framework: a single,
8//! audited [`json_escape`] routine handles string escaping so the output is
9//! always valid JSON regardless of the bytes present in a file path.
10//!
11//! SARIF and Markdown follow `skills/rust-sarif.md`: SARIF 2.1.0 is produced and
12//! strict-validated with [`sarif_rust`], and Markdown is derived from that SARIF
13//! via [`sarif_to_md_core`]. This keeps the structured output spec-compliant for
14//! CI ingestion, code review, and compliance tooling.
15
16use sarif_rust::parser::SarifValidator;
17use sarif_rust::{Level, ResultBuilder, RunBuilder, SarifLogBuilder, ToolBuilder};
18use sarif_to_md_core::markdown::sarif::generator::SarifMarkdownGenerator;
19use sarif_to_md_core::markdown::MarkdownFormat;
20use sarif_to_md_core::ReportProcessorBuilder;
21
22use crate::validate::Validity;
23
/// SARIF rule id declared on the tool and attached to every result built by
/// [`build_sarif`]: `error`-level results for invalid inputs and `none`-level
/// results for valid ones all carry this id.
const RULE_ID: &str = "invalid-utf8";
/// Tool name handed to the SARIF tool descriptor in [`build_sarif`] (and thus
/// present in any Markdown derived from that SARIF).
const TOOL_NAME: &str = "simdutf8-cli";
28
/// Output format selected on the command line.
///
/// The `clap::ValueEnum` derive makes the variants usable as values of a
/// command-line argument; `Text` is the `Default` variant.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, clap::ValueEnum)]
pub enum OutputFormat {
    /// One human-readable line per input.
    #[default]
    Text,
    /// A JSON array with one object per input.
    Json,
    /// SARIF 2.1.0 JSON (strict-validated), for CI ingestion.
    Sarif,
    /// GitHub-Flavored Markdown derived from the SARIF report.
    Markdown,
}
42
/// A single input's validation outcome, carried through to whichever output
/// format is rendered at the end of a run.
///
/// The renderers in this module take a slice of these and emit one line,
/// record, or SARIF result per element, in slice order.
#[derive(Clone, Debug)]
pub struct Finding {
    /// Display label for the input (a file path, or `<stdin>`).
    ///
    /// Used verbatim in text output and messages; JSON output escapes it and
    /// the SARIF location percent-encodes it.
    pub label: String,
    /// The validation verdict for the input.
    pub validity: Validity,
}
52
/// Errors from building or converting structured (SARIF / Markdown) reports.
///
/// Both variants carry a plain `String`: within this module the payload is
/// either the `to_string()` of an underlying library error or a fixed
/// description produced by the Markdown structure check.
#[derive(Debug, thiserror::Error)]
pub enum ReportError {
    /// SARIF generation or strict validation failed.
    #[error("SARIF generation failed: {0}")]
    Sarif(String),
    /// SARIF-to-Markdown conversion or validation failed.
    #[error("Markdown generation failed: {0}")]
    Markdown(String),
}
63
64/// Escape a string for safe inclusion inside a JSON string literal.
65///
66/// Escapes `"`, `\\`, the C0 control range (`U+0000..=U+001F`) and the common
67/// shorthand escapes. The result does *not* include the surrounding quotes.
68#[must_use]
69pub fn json_escape(input: &str) -> String {
70    let mut out = String::with_capacity(input.len());
71    for ch in input.chars() {
72        match ch {
73            '"' => out.push_str("\\\""),
74            '\\' => out.push_str("\\\\"),
75            '\n' => out.push_str("\\n"),
76            '\r' => out.push_str("\\r"),
77            '\t' => out.push_str("\\t"),
78            '\u{0008}' => out.push_str("\\b"),
79            '\u{000C}' => out.push_str("\\f"),
80            c if u32::from(c) < 0x20 => {
81                // C0 control char (< U+0020): emit a \u00XX escape.
82                let byte = u8::try_from(u32::from(c)).unwrap_or(0);
83                out.push_str("\\u00");
84                out.push(hex_nibble(byte >> 4));
85                out.push(hex_nibble(byte & 0x0F));
86            },
87            c => out.push(c),
88        }
89    }
90    out
91}
92
/// Lowercase hexadecimal digit for a 4-bit value (`0..=15`).
///
/// Computed arithmetically rather than through a lookup table, so no slice
/// indexing is involved. Callers are expected to pass a value already reduced
/// to four bits.
const fn hex_nibble(nibble: u8) -> char {
    if nibble < 10 {
        (b'0' + nibble) as char
    } else {
        (b'a' + nibble - 10) as char
    }
}
100
101/// Percent-encode a (possibly non-ASCII, space-bearing) path into a valid
102/// RFC 3986 URI reference for use as a SARIF `artifactLocation.uri`.
103///
104/// Path separators (`/`) and the unreserved set are preserved; every other byte
105/// — spaces, `:`, `?`, and all non-ASCII UTF-8 bytes — is percent-encoded. This
106/// keeps strict SARIF URI validation happy for arbitrary file names while the
107/// human-readable name remains in the result message.
108fn path_to_uri(path: &str) -> String {
109    let mut out = String::with_capacity(path.len());
110    for &byte in path.as_bytes() {
111        match byte {
112            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~' | b'/' => {
113                out.push(char::from(byte));
114            },
115            _ => {
116                out.push('%');
117                out.push(hex_nibble(byte >> 4).to_ascii_uppercase());
118                out.push(hex_nibble(byte & 0x0F).to_ascii_uppercase());
119            },
120        }
121    }
122    if out.is_empty() {
123        out.push('.');
124    }
125    out
126}
127
/// Short human-readable description of an invalid sequence.
///
/// With a known length the text is `"<n> invalid byte"` for exactly one byte
/// and `"<n> invalid bytes"` otherwise; with no length it is
/// `"incomplete sequence"`.
fn invalid_detail(error_len: Option<usize>) -> String {
    error_len.map_or_else(
        || String::from("incomplete sequence"),
        |count| {
            let noun = if count == 1 { "byte" } else { "bytes" };
            format!("{count} invalid {noun}")
        },
    )
}
136
137/// Render a single verdict as one human-readable line (no trailing newline).
138#[must_use]
139pub fn text_line(label: &str, validity: &Validity) -> String {
140    match validity {
141        Validity::Valid => format!("OK    {label}"),
142        Validity::Invalid {
143            valid_up_to,
144            error_len,
145        } => {
146            let detail = invalid_detail(*error_len);
147            format!("FAIL  {label}: invalid UTF-8 at byte {valid_up_to} ({detail})")
148        },
149    }
150}
151
152/// Render a single verdict as a JSON object (no trailing newline).
153#[must_use]
154pub fn json_record(label: &str, validity: &Validity) -> String {
155    let path = json_escape(label);
156    match validity {
157        Validity::Valid => format!(r#"{{"path":"{path}","valid":true}}"#),
158        Validity::Invalid {
159            valid_up_to,
160            error_len,
161        } => {
162            let error_len = error_len.map_or_else(|| "null".to_owned(), |len| len.to_string());
163            format!(
164                r#"{{"path":"{path}","valid":false,"valid_up_to":{valid_up_to},"error_len":{error_len}}}"#
165            )
166        },
167    }
168}
169
170/// Render every finding as the human-readable text block (trailing newline).
171#[must_use]
172pub fn text_block(findings: &[Finding]) -> String {
173    let mut out = String::new();
174    for finding in findings {
175        out.push_str(&text_line(&finding.label, &finding.validity));
176        out.push('\n');
177    }
178    out
179}
180
181/// Render every finding as a JSON array (trailing newline).
182#[must_use]
183pub fn json_block(findings: &[Finding]) -> String {
184    let records: Vec<String> = findings
185        .iter()
186        .map(|finding| json_record(&finding.label, &finding.validity))
187        .collect();
188    format!("[{}]\n", records.join(","))
189}
190
191/// Build a spec-compliant, strict-validated SARIF 2.1.0 document from findings.
192///
193/// Each invalid input becomes an `error`-level result and each valid input a
194/// `none`-level result, all under the `invalid-utf8` rule declared on the tool.
195///
196/// # Errors
197///
198/// Returns [`ReportError::Sarif`] if the document fails to build or fails strict
199/// SARIF validation.
200pub fn build_sarif(findings: &[Finding]) -> std::result::Result<String, ReportError> {
201    let tool = ToolBuilder::new(TOOL_NAME)
202        .with_version(env!("CARGO_PKG_VERSION"))
203        .add_simple_rule(RULE_ID, "Invalid UTF-8")
204        .build();
205    let mut run = RunBuilder::new(tool);
206
207    for finding in findings {
208        let (level, message) = match finding.validity {
209            Validity::Valid => (Level::None, format!("{}: valid UTF-8", finding.label)),
210            Validity::Invalid {
211                valid_up_to,
212                error_len,
213            } => (
214                Level::Error,
215                format!(
216                    "{}: invalid UTF-8 at byte {valid_up_to} ({})",
217                    finding.label,
218                    invalid_detail(error_len)
219                ),
220            ),
221        };
222        let result = ResultBuilder::with_text_message(message)
223            .with_rule_id(RULE_ID)
224            .with_level(level)
225            // The location uri must be a valid URI reference; the readable name
226            // stays in the message above.
227            .add_file_location(path_to_uri(&finding.label), 1, 1)
228            .build();
229        run = run.add_result(result);
230    }
231
232    let log = SarifLogBuilder::with_standard_schema()
233        .add_run(run.build())
234        .build()
235        .map_err(|error| ReportError::Sarif(error.to_string()))?;
236
237    SarifValidator::strict()
238        .validate_sarif_log(&log)
239        .map_err(|error| ReportError::Sarif(error.to_string()))?;
240
241    sarif_rust::to_string_pretty(&log).map_err(|error| ReportError::Sarif(error.to_string()))
242}
243
244/// Convert a SARIF JSON document to GitHub-Flavored Markdown, validating that
245/// the result is non-empty and carries some structure.
246///
247/// # Errors
248///
249/// Returns [`ReportError::Markdown`] if conversion fails or the output is empty
250/// or structureless.
251pub fn sarif_to_markdown(sarif_json: &str) -> std::result::Result<String, ReportError> {
252    let generator = SarifMarkdownGenerator::new(MarkdownFormat::GitHubFlavored, true);
253    let processor = ReportProcessorBuilder::new()
254        .generator(generator)
255        .content(sarif_json.to_owned())
256        .build()
257        .map_err(|error| ReportError::Markdown(error.to_string()))?;
258    let markdown = processor
259        .generate()
260        .map_err(|error| ReportError::Markdown(error.to_string()))?;
261    validate_markdown(&markdown)?;
262    Ok(markdown)
263}
264
265/// Validate generated Markdown: non-empty and containing some structure
266/// (a heading, table, or thematic break).
267///
268/// # Errors
269///
270/// Returns [`ReportError::Markdown`] if `markdown` is empty or has no structure.
271fn validate_markdown(markdown: &str) -> std::result::Result<(), ReportError> {
272    if markdown.trim().is_empty() {
273        return Err(ReportError::Markdown("output is empty".to_owned()));
274    }
275    if !markdown.contains('#') && !markdown.contains('|') && !markdown.contains("---") {
276        return Err(ReportError::Markdown(
277            "output missing expected structure (no headings, tables, or rules)".to_owned(),
278        ));
279    }
280    Ok(())
281}
282
/// Unit tests for the report renderers.
///
/// Covers the escaping helpers, the per-finding and whole-run text/JSON
/// renderers, SARIF generation (including awkward paths), and the Markdown
/// structure check in both its accepting and rejecting branches.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn json_escape_passes_through_plain_text() {
        assert_eq!(json_escape("plain"), "plain");
    }

    #[test]
    fn json_escape_handles_quotes_and_backslashes() {
        assert_eq!(json_escape(r#"a"b\c"#), r#"a\"b\\c"#);
    }

    #[test]
    fn json_escape_handles_control_characters() {
        assert_eq!(json_escape("line\nbreak\ttab"), "line\\nbreak\\ttab");
        // A bare control char with no shorthand becomes a \u escape.
        assert_eq!(json_escape("\u{0001}"), "\\u0001");
    }

    #[test]
    fn json_escape_handles_remaining_shorthands_and_upper_c0() {
        assert_eq!(json_escape("\r\u{0008}\u{000C}"), "\\r\\b\\f");
        // Highest C0 control: exercises both hex digits, in lowercase.
        assert_eq!(json_escape("\u{001F}"), "\\u001f");
        // Anything at or above U+0020 (other than `"` and `\`) is untouched.
        assert_eq!(json_escape(" \u{007F}é"), " \u{007F}é");
    }

    #[test]
    fn invalid_detail_distinguishes_singular_plural_and_unknown() {
        assert_eq!(invalid_detail(Some(1)), "1 invalid byte");
        assert_eq!(invalid_detail(Some(3)), "3 invalid bytes");
        assert_eq!(invalid_detail(None), "incomplete sequence");
    }

    #[test]
    fn text_line_marks_valid_inputs() {
        let line = text_line("file.txt", &Validity::Valid);
        assert!(line.contains("OK"), "got: {line}");
        assert!(line.contains("file.txt"), "got: {line}");
    }

    #[test]
    fn text_line_marks_invalid_inputs_with_location() {
        let v = Validity::Invalid {
            valid_up_to: 3,
            error_len: Some(1),
        };
        let line = text_line("bad.bin", &v);
        assert!(line.contains("FAIL"), "got: {line}");
        assert!(line.contains("bad.bin"), "got: {line}");
        assert!(line.contains('3'), "got: {line}");
    }

    #[test]
    fn json_record_for_valid_input() {
        let rec = json_record("file.txt", &Validity::Valid);
        assert!(rec.contains(r#""valid":true"#), "got: {rec}");
        assert!(rec.contains(r#""path":"file.txt""#), "got: {rec}");
    }

    #[test]
    fn json_record_for_invalid_input() {
        let v = Validity::Invalid {
            valid_up_to: 3,
            error_len: None,
        };
        let rec = json_record("bad.bin", &v);
        assert!(rec.contains(r#""valid":false"#), "got: {rec}");
        assert!(rec.contains(r#""valid_up_to":3"#), "got: {rec}");
        assert!(rec.contains(r#""error_len":null"#), "got: {rec}");
    }

    #[test]
    fn json_record_escapes_the_path() {
        let rec = json_record(r#"a"b"#, &Validity::Valid);
        assert!(rec.contains(r#""path":"a\"b""#), "got: {rec}");
    }

    fn sample_findings() -> Vec<Finding> {
        vec![
            Finding {
                label: "ok.txt".to_owned(),
                validity: Validity::Valid,
            },
            Finding {
                label: "bad.bin".to_owned(),
                validity: Validity::Invalid {
                    valid_up_to: 3,
                    error_len: Some(1),
                },
            },
        ]
    }

    #[test]
    fn text_block_emits_one_terminated_line_per_finding() {
        assert_eq!(
            text_block(&sample_findings()),
            "OK    ok.txt\nFAIL  bad.bin: invalid UTF-8 at byte 3 (1 invalid byte)\n"
        );
        assert_eq!(text_block(&[]), "");
    }

    #[test]
    fn json_block_emits_a_comma_separated_array() {
        let expected = concat!(
            r#"[{"path":"ok.txt","valid":true},"#,
            r#"{"path":"bad.bin","valid":false,"valid_up_to":3,"error_len":1}]"#,
            "\n"
        );
        assert_eq!(json_block(&sample_findings()), expected);
        assert_eq!(json_block(&[]), "[]\n");
    }

    #[test]
    fn build_sarif_is_valid_and_parses() {
        let json = build_sarif(&sample_findings()).expect("sarif builds & validates");
        assert!(json.contains("2.1.0"), "expected schema version: {json}");
        assert!(json.contains(RULE_ID), "expected rule id: {json}");
        // Round-trips through the parser (independent of our builder).
        let parsed = sarif_rust::from_str(&json);
        assert!(parsed.is_ok(), "SARIF should re-parse: {parsed:?}");
    }

    #[test]
    fn markdown_from_sarif_has_structure() {
        let json = build_sarif(&sample_findings()).unwrap();
        let md = sarif_to_markdown(&json).expect("markdown generates");
        assert!(
            md.contains('#') || md.contains('|') || md.contains("---"),
            "markdown should be structured: {md}"
        );
    }

    #[test]
    fn validate_markdown_rejects_empty() {
        assert!(validate_markdown("   \n  ").is_err());
    }

    #[test]
    fn validate_markdown_rejects_structureless_text() {
        assert!(validate_markdown("plain words only").is_err());
    }

    #[test]
    fn validate_markdown_accepts_any_structure_marker() {
        assert!(validate_markdown("# Title").is_ok());
        assert!(validate_markdown("a | b").is_ok());
        assert!(validate_markdown("above\n---\nbelow").is_ok());
    }

    #[test]
    fn build_sarif_handles_paths_with_spaces_and_unicode() {
        // Strict SARIF URI validation rejects raw spaces / non-ASCII, so these
        // must be percent-encoded into the artifact location.
        let findings = vec![
            Finding {
                label: "my café/файл .txt".to_owned(),
                validity: Validity::Invalid {
                    valid_up_to: 0,
                    error_len: Some(1),
                },
            },
            Finding {
                label: "C:/weird?name*.bin".to_owned(),
                validity: Validity::Valid,
            },
        ];
        let json = build_sarif(&findings).expect("tricky paths must still validate");
        assert!(sarif_rust::from_str(&json).is_ok());
        // The readable name is preserved in the message even though the uri is
        // percent-encoded.
        assert!(json.contains("my café/файл .txt"), "message lost the label");
    }

    #[test]
    fn path_to_uri_encodes_specials_keeps_separators() {
        assert_eq!(path_to_uri("a/b.txt"), "a/b.txt");
        assert_eq!(path_to_uri("a b"), "a%20b");
        assert_eq!(path_to_uri("café"), "caf%C3%A9");
        assert_eq!(path_to_uri(""), ".");
    }

    #[test]
    fn path_to_uri_encodes_reserved_characters_in_uppercase_hex() {
        assert_eq!(
            path_to_uri("C:/weird?name*.bin"),
            "C%3A/weird%3Fname%2A.bin"
        );
        assert_eq!(path_to_uri("<stdin>"), "%3Cstdin%3E");
        // The unreserved punctuation set is left alone.
        assert_eq!(path_to_uri("a-b_c.d~e"), "a-b_c.d~e");
    }
}