simdutf8-cli 0.1.6

SIMD-accelerated UTF-8 validation CLI built on the simdutf8 crate, with hardened path handling.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: 2025,2026 ndaal Gesellschaft für Sicherheit in der Informationstechnik mbH & Co KG, Cologne
// SPDX-FileCopyrightText: Author: Pierre Gronau <Pierre.Gronau@ndaal.eu>

//! Rendering of [`Validity`] verdicts as text, JSON, SARIF 2.1.0, or Markdown.
//!
//! Plain text and JSON are emitted without a serialization framework: a single,
//! audited [`json_escape`] routine handles string escaping so the output is
//! always valid JSON regardless of the bytes present in a file path.
//!
//! SARIF and Markdown follow `skills/rust-sarif.md`: SARIF 2.1.0 is produced and
//! strict-validated with [`sarif_rust`], and Markdown is derived from that SARIF
//! via [`sarif_to_md_core`]. This keeps the structured output spec-compliant for
//! CI ingestion, code review, and compliance tooling.

use sarif_rust::parser::SarifValidator;
use sarif_rust::{Level, ResultBuilder, RunBuilder, SarifLogBuilder, ToolBuilder};
use sarif_to_md_core::markdown::sarif::generator::SarifMarkdownGenerator;
use sarif_to_md_core::markdown::MarkdownFormat;
use sarif_to_md_core::ReportProcessorBuilder;

use crate::validate::Validity;

/// SARIF rule id declared on the tool and attached to every result emitted by
/// [`build_sarif`] — for valid inputs as well as invalid ones.
const RULE_ID: &str = "invalid-utf8";
/// Tool name passed to the SARIF tool descriptor in [`build_sarif`].
const TOOL_NAME: &str = "simdutf8-cli";

/// Output format selected on the command line.
///
/// The type derives [`clap::ValueEnum`], and [`OutputFormat::Text`] is the
/// [`Default`] variant.
// NOTE(review): clap's `ValueEnum` derive is understood to surface the variant
// doc comments below as per-value help text, so rewording them would change
// `--help` output — confirm against the clap documentation before editing.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, clap::ValueEnum)]
pub enum OutputFormat {
    /// One human-readable line per input.
    #[default]
    Text,
    /// A JSON array with one object per input.
    Json,
    /// SARIF 2.1.0 JSON (strict-validated), for CI ingestion.
    Sarif,
    /// GitHub-Flavored Markdown derived from the SARIF report.
    Markdown,
}

/// A single input's validation outcome, carried through to whichever output
/// format is rendered at the end of a run.
///
/// The block renderers in this module ([`text_block`], [`json_block`] and
/// [`build_sarif`]) all take a slice of these and emit one entry per element,
/// in slice order.
#[derive(Clone, Debug)]
pub struct Finding {
    /// Display label for the input (a file path, or `<stdin>`).
    ///
    /// Printed verbatim in text output and SARIF messages, JSON-escaped in
    /// JSON output, and percent-encoded for the SARIF location URI.
    pub label: String,
    /// The validation verdict for the input.
    pub validity: Validity,
}

/// Errors from building or converting structured (SARIF / Markdown) reports.
///
/// Both variants carry a plain message: either the `to_string()` rendering of
/// the underlying library error, or a fixed description produced by this
/// module's own Markdown checks. The original error value is not retained.
#[derive(Debug, thiserror::Error)]
pub enum ReportError {
    /// SARIF generation or strict validation failed.
    #[error("SARIF generation failed: {0}")]
    Sarif(String),
    /// SARIF-to-Markdown conversion or validation failed.
    #[error("Markdown generation failed: {0}")]
    Markdown(String),
}

/// Produce the body of a JSON string literal for `input`.
///
/// The double quote and the backslash are backslash-escaped; newline, carriage
/// return, tab, backspace and form feed use their two-character shorthand; any
/// other character below `U+0020` is written as a lowercase `\u00XX` escape.
/// Every remaining character is copied through unchanged. The enclosing
/// quotes are left to the caller.
#[must_use]
pub fn json_escape(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        // Characters that have a dedicated two-character escape.
        let shorthand = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            '\u{0008}' => Some("\\b"),
            '\u{000C}' => Some("\\f"),
            _ => None,
        };
        if let Some(sequence) = shorthand {
            escaped.push_str(sequence);
        } else if u32::from(ch) < 0x20 {
            // Remaining C0 controls: the code point fits in one byte, so two
            // hex digits after the fixed `\u00` prefix are enough.
            let code = u8::try_from(u32::from(ch)).unwrap_or(0);
            escaped.push_str("\\u00");
            escaped.push(hex_nibble(code >> 4));
            escaped.push(hex_nibble(code & 0x0F));
        } else {
            escaped.push(ch);
        }
    }
    escaped
}

/// Convert a nibble (`0..=15`) into the matching lowercase hexadecimal digit.
///
/// The digit is computed arithmetically from the ASCII codes of `'0'` and
/// `'a'`, so no lookup table or slice indexing is involved. Callers are
/// expected to pass only values in `0..=15`.
const fn hex_nibble(nibble: u8) -> char {
    let ascii = if nibble <= 9 {
        b'0' + nibble
    } else {
        b'a' + nibble - 10
    };
    ascii as char
}

/// Turn a path into a percent-encoded URI reference suitable for a SARIF
/// `artifactLocation.uri`.
///
/// ASCII letters, digits, `-`, `.`, `_`, `~` and the `/` separator are copied
/// as-is. Every other byte of the UTF-8 encoding — spaces, `:`, `?`, `%`, and
/// all non-ASCII bytes — becomes `%XX` with uppercase hex digits. An empty
/// path is rendered as `.` so the result is never an empty string.
fn path_to_uri(path: &str) -> String {
    if path.is_empty() {
        return ".".to_owned();
    }
    let mut uri = String::with_capacity(path.len());
    for byte in path.bytes() {
        let literal =
            byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'.' | b'_' | b'~' | b'/');
        if literal {
            uri.push(char::from(byte));
        } else {
            // `hex_nibble` yields lowercase digits; percent-escapes here are
            // written in uppercase.
            let (high, low) = (hex_nibble(byte >> 4), hex_nibble(byte & 0x0F));
            uri.push('%');
            uri.push(high.to_ascii_uppercase());
            uri.push(low.to_ascii_uppercase());
        }
    }
    uri
}

/// Describe the length of an invalid sequence in words.
///
/// `Some(1)` uses the singular form, any other `Some(n)` the plural form, and
/// `None` is reported as an incomplete sequence.
fn invalid_detail(error_len: Option<usize>) -> String {
    error_len.map_or_else(
        || "incomplete sequence".to_owned(),
        |count| {
            if count == 1 {
                "1 invalid byte".to_owned()
            } else {
                format!("{count} invalid bytes")
            }
        },
    )
}

/// Format one verdict as a single line of plain text, without a line ending.
///
/// Valid inputs produce `OK    <label>`. Invalid inputs produce a `FAIL` line
/// naming the byte offset up to which the input was valid, followed by a
/// short description of the offending sequence.
#[must_use]
pub fn text_line(label: &str, validity: &Validity) -> String {
    match *validity {
        Validity::Invalid {
            valid_up_to,
            error_len,
        } => {
            let detail = invalid_detail(error_len);
            format!("FAIL  {label}: invalid UTF-8 at byte {valid_up_to} ({detail})")
        },
        Validity::Valid => format!("OK    {label}"),
    }
}

/// Render a single verdict as a JSON object (no trailing newline).
#[must_use]
pub fn json_record(label: &str, validity: &Validity) -> String {
    let path = json_escape(label);
    match validity {
        Validity::Valid => format!(r#"{{"path":"{path}","valid":true}}"#),
        Validity::Invalid {
            valid_up_to,
            error_len,
        } => {
            let error_len = error_len.map_or_else(|| "null".to_owned(), |len| len.to_string());
            format!(
                r#"{{"path":"{path}","valid":false,"valid_up_to":{valid_up_to},"error_len":{error_len}}}"#
            )
        },
    }
}

/// Concatenate the text line of every finding, each terminated by `\n`.
///
/// Findings are rendered in slice order; an empty slice yields an empty
/// string.
#[must_use]
pub fn text_block(findings: &[Finding]) -> String {
    findings.iter().fold(String::new(), |mut block, finding| {
        block.push_str(&text_line(&finding.label, &finding.validity));
        block.push('\n');
        block
    })
}

/// Serialize all findings as one compact JSON array followed by `\n`.
///
/// Records appear in slice order, separated by a bare comma; an empty slice
/// yields `[]`.
#[must_use]
pub fn json_block(findings: &[Finding]) -> String {
    let mut out = String::from("[");
    for (index, finding) in findings.iter().enumerate() {
        // Separator goes before every record except the first.
        if index > 0 {
            out.push(',');
        }
        out.push_str(&json_record(&finding.label, &finding.validity));
    }
    out.push_str("]\n");
    out
}

/// Build a spec-compliant, strict-validated SARIF 2.1.0 document from findings.
///
/// Each invalid input becomes an `error`-level result and each valid input a
/// `none`-level result, all under the `invalid-utf8` rule declared on the tool.
///
/// # Errors
///
/// Returns [`ReportError::Sarif`] if the document fails to build or fails strict
/// SARIF validation.
pub fn build_sarif(findings: &[Finding]) -> std::result::Result<String, ReportError> {
    let tool = ToolBuilder::new(TOOL_NAME)
        .with_version(env!("CARGO_PKG_VERSION"))
        .add_simple_rule(RULE_ID, "Invalid UTF-8")
        .build();
    let mut run = RunBuilder::new(tool);

    for finding in findings {
        let (level, message) = match finding.validity {
            Validity::Valid => (Level::None, format!("{}: valid UTF-8", finding.label)),
            Validity::Invalid {
                valid_up_to,
                error_len,
            } => (
                Level::Error,
                format!(
                    "{}: invalid UTF-8 at byte {valid_up_to} ({})",
                    finding.label,
                    invalid_detail(error_len)
                ),
            ),
        };
        let result = ResultBuilder::with_text_message(message)
            .with_rule_id(RULE_ID)
            .with_level(level)
            // The location uri must be a valid URI reference; the readable name
            // stays in the message above.
            .add_file_location(path_to_uri(&finding.label), 1, 1)
            .build();
        run = run.add_result(result);
    }

    let log = SarifLogBuilder::with_standard_schema()
        .add_run(run.build())
        .build()
        .map_err(|error| ReportError::Sarif(error.to_string()))?;

    SarifValidator::strict()
        .validate_sarif_log(&log)
        .map_err(|error| ReportError::Sarif(error.to_string()))?;

    sarif_rust::to_string_pretty(&log).map_err(|error| ReportError::Sarif(error.to_string()))
}

/// Render a SARIF JSON document as GitHub-Flavored Markdown.
///
/// The generated Markdown is passed through [`validate_markdown`] before it
/// is returned, so a successful result is never blank and always contains at
/// least one structural marker.
///
/// # Errors
///
/// Returns [`ReportError::Markdown`] if the processor cannot be built, the
/// conversion fails, or the generated text does not pass validation.
pub fn sarif_to_markdown(sarif_json: &str) -> std::result::Result<String, ReportError> {
    let markdown = ReportProcessorBuilder::new()
        .generator(SarifMarkdownGenerator::new(
            MarkdownFormat::GitHubFlavored,
            true,
        ))
        .content(sarif_json.to_owned())
        .build()
        .map_err(|err| ReportError::Markdown(err.to_string()))?
        .generate()
        .map_err(|err| ReportError::Markdown(err.to_string()))?;
    validate_markdown(&markdown).map(|()| markdown)
}

/// Sanity-check generated Markdown.
///
/// The text must contain something other than whitespace, and must include at
/// least one of `#`, `|` or `---` anywhere in it. This is a substring check
/// standing in for "has a heading, table, or thematic break", not a Markdown
/// parse.
///
/// # Errors
///
/// Returns [`ReportError::Markdown`] when `markdown` is blank or contains none
/// of the structural markers.
fn validate_markdown(markdown: &str) -> std::result::Result<(), ReportError> {
    let reason = if markdown.trim().is_empty() {
        "output is empty"
    } else if markdown.contains('#') || markdown.contains('|') || markdown.contains("---") {
        return Ok(());
    } else {
        "output missing expected structure (no headings, tables, or rules)"
    };
    Err(ReportError::Markdown(reason.to_owned()))
}

// Unit tests for the renderers in this module: JSON escaping, the text and
// JSON single-record formats, SARIF generation, Markdown derivation, and the
// percent-encoding of paths used for SARIF location URIs.
#[cfg(test)]
mod tests {
    use super::*;

    // Text with nothing to escape comes back unchanged.
    #[test]
    fn json_escape_passes_through_plain_text() {
        assert_eq!(json_escape("plain"), "plain");
    }

    // Both characters that would terminate or corrupt a JSON string literal
    // are backslash-escaped.
    #[test]
    fn json_escape_handles_quotes_and_backslashes() {
        assert_eq!(json_escape(r#"a"b\c"#), r#"a\"b\\c"#);
    }

    // Shorthand escapes are used where they exist; other controls use \u00XX.
    #[test]
    fn json_escape_handles_control_characters() {
        assert_eq!(json_escape("line\nbreak\ttab"), "line\\nbreak\\ttab");
        // A bare control char with no shorthand becomes a \u escape.
        assert_eq!(json_escape("\u{0001}"), "\\u0001");
    }

    // A valid verdict yields an `OK` line that names the input.
    #[test]
    fn text_line_marks_valid_inputs() {
        let line = text_line("file.txt", &Validity::Valid);
        assert!(line.contains("OK"), "got: {line}");
        assert!(line.contains("file.txt"), "got: {line}");
    }

    // An invalid verdict yields a `FAIL` line with the label and byte offset.
    #[test]
    fn text_line_marks_invalid_inputs_with_location() {
        let v = Validity::Invalid {
            valid_up_to: 3,
            error_len: Some(1),
        };
        let line = text_line("bad.bin", &v);
        assert!(line.contains("FAIL"), "got: {line}");
        assert!(line.contains("bad.bin"), "got: {line}");
        assert!(line.contains('3'), "got: {line}");
    }

    // The valid JSON record carries the path and `"valid":true`.
    #[test]
    fn json_record_for_valid_input() {
        let rec = json_record("file.txt", &Validity::Valid);
        assert!(rec.contains(r#""valid":true"#), "got: {rec}");
        assert!(rec.contains(r#""path":"file.txt""#), "got: {rec}");
    }

    // The invalid JSON record carries the offset, and a missing error length
    // is serialized as `null`.
    #[test]
    fn json_record_for_invalid_input() {
        let v = Validity::Invalid {
            valid_up_to: 3,
            error_len: None,
        };
        let rec = json_record("bad.bin", &v);
        assert!(rec.contains(r#""valid":false"#), "got: {rec}");
        assert!(rec.contains(r#""valid_up_to":3"#), "got: {rec}");
        assert!(rec.contains(r#""error_len":null"#), "got: {rec}");
    }

    // A quote inside the label must not break out of the JSON string.
    #[test]
    fn json_record_escapes_the_path() {
        let rec = json_record(r#"a"b"#, &Validity::Valid);
        assert!(rec.contains(r#""path":"a\"b""#), "got: {rec}");
    }

    // Shared fixture: one valid and one invalid finding.
    fn sample_findings() -> Vec<Finding> {
        vec![
            Finding {
                label: "ok.txt".to_owned(),
                validity: Validity::Valid,
            },
            Finding {
                label: "bad.bin".to_owned(),
                validity: Validity::Invalid {
                    valid_up_to: 3,
                    error_len: Some(1),
                },
            },
        ]
    }

    // The generated SARIF mentions the schema version and rule id, and can be
    // parsed back by the SARIF library.
    #[test]
    fn build_sarif_is_valid_and_parses() {
        let json = build_sarif(&sample_findings()).expect("sarif builds & validates");
        assert!(json.contains("2.1.0"), "expected schema version: {json}");
        assert!(json.contains(RULE_ID), "expected rule id: {json}");
        // Round-trips through the parser (independent of our builder).
        let parsed = sarif_rust::from_str(&json);
        assert!(parsed.is_ok(), "SARIF should re-parse: {parsed:?}");
    }

    // Markdown derived from the sample SARIF contains at least one of the
    // structural markers that `validate_markdown` looks for.
    #[test]
    fn markdown_from_sarif_has_structure() {
        let json = build_sarif(&sample_findings()).unwrap();
        let md = sarif_to_markdown(&json).expect("markdown generates");
        assert!(
            md.contains('#') || md.contains('|') || md.contains("---"),
            "markdown should be structured: {md}"
        );
    }

    // Whitespace-only output counts as empty.
    #[test]
    fn validate_markdown_rejects_empty() {
        assert!(validate_markdown("   \n  ").is_err());
    }

    // Labels with spaces, non-ASCII text, and URI-reserved characters must
    // still produce SARIF that passes strict validation.
    #[test]
    fn build_sarif_handles_paths_with_spaces_and_unicode() {
        // Strict SARIF URI validation rejects raw spaces / non-ASCII, so these
        // must be percent-encoded into the artifact location.
        let findings = vec![
            Finding {
                label: "my café/файл .txt".to_owned(),
                validity: Validity::Invalid {
                    valid_up_to: 0,
                    error_len: Some(1),
                },
            },
            Finding {
                label: "C:/weird?name*.bin".to_owned(),
                validity: Validity::Valid,
            },
        ];
        let json = build_sarif(&findings).expect("tricky paths must still validate");
        assert!(sarif_rust::from_str(&json).is_ok());
        // The readable name is preserved in the message even though the uri is
        // percent-encoded.
        assert!(json.contains("my café/файл .txt"), "message lost the label");
    }

    // Separators and unreserved characters pass through; spaces and non-ASCII
    // bytes are percent-encoded; the empty path maps to ".".
    #[test]
    fn path_to_uri_encodes_specials_keeps_separators() {
        assert_eq!(path_to_uri("a/b.txt"), "a/b.txt");
        assert_eq!(path_to_uri("a b"), "a%20b");
        assert_eq!(path_to_uri("café"), "caf%C3%A9");
        assert_eq!(path_to_uri(""), ".");
    }
}