use sarif_rust::parser::SarifValidator;
use sarif_rust::{Level, ResultBuilder, RunBuilder, SarifLogBuilder, ToolBuilder};
use sarif_to_md_core::markdown::sarif::generator::SarifMarkdownGenerator;
use sarif_to_md_core::markdown::MarkdownFormat;
use sarif_to_md_core::ReportProcessorBuilder;
use crate::validate::Validity;
/// Rule identifier registered on the SARIF tool and attached to every SARIF result
/// emitted by `build_sarif`.
const RULE_ID: &str = "invalid-utf8";
/// Tool name passed to the SARIF tool builder in `build_sarif`.
const TOOL_NAME: &str = "simdutf8-cli";
/// Report output formats.
///
/// Derives `clap::ValueEnum`, so the variants are usable as command-line values.
/// NOTE(review): the code that maps a variant to a renderer is not in this file;
/// the per-variant notes below name the renderer each one presumably selects.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, clap::ValueEnum)]
pub enum OutputFormat {
/// Default variant; presumably rendered by `text_block` (one line per finding).
#[default]
Text,
/// Presumably rendered by `json_block` (a JSON array of records).
Json,
/// Presumably rendered by `build_sarif`.
Sarif,
/// Presumably rendered by feeding `build_sarif` output to `sarif_to_markdown`.
Markdown,
}
/// The validation outcome for a single labelled input.
#[derive(Clone, Debug)]
pub struct Finding {
/// Label shown in text output, emitted as `"path"` in JSON records, and
/// percent-encoded into the file location of SARIF results.
pub label: String,
/// Whether the input was valid UTF-8 and, if not, where validation failed.
pub validity: Validity,
}
/// Errors produced while rendering the SARIF and Markdown report formats.
///
/// Both variants carry the stringified underlying error (or a fixed description).
#[derive(Debug, thiserror::Error)]
pub enum ReportError {
/// Building, validating, or serializing the SARIF log failed (see `build_sarif`).
#[error("SARIF generation failed: {0}")]
Sarif(String),
/// Converting SARIF to Markdown failed, or the generated Markdown did not pass
/// `validate_markdown`.
#[error("Markdown generation failed: {0}")]
Markdown(String),
}
/// Escapes `input` so it can be placed inside a JSON string literal.
///
/// The surrounding double quotes are not added. `"` and `\` are backslash-escaped;
/// newline, carriage return, tab, backspace and form feed use their short escapes;
/// any other character below U+0020 is written as `\u00XX` with lowercase hex
/// digits. Every other character is copied through unchanged.
#[must_use]
pub fn json_escape(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        // Characters that have a dedicated two-character escape.
        let short_escape = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            '\u{0008}' => Some("\\b"),
            '\u{000C}' => Some("\\f"),
            _ => None,
        };
        if let Some(sequence) = short_escape {
            escaped.push_str(sequence);
            continue;
        }

        let code_point = u32::from(ch);
        if code_point < 0x20 {
            // The value is below 0x20, so the narrowing conversion cannot fail;
            // the zero is only a formal fallback.
            let byte = u8::try_from(code_point).unwrap_or(0);
            escaped.push_str("\\u00");
            escaped.extend([hex_nibble(byte >> 4), hex_nibble(byte & 0x0F)]);
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
/// Maps a 4-bit value to its lowercase ASCII hexadecimal digit.
///
/// Callers in this file only pass values in `0..=15`.
const fn hex_nibble(nibble: u8) -> char {
    let ascii = if nibble < 10 {
        b'0' + nibble
    } else {
        b'a' + nibble - 10
    };
    ascii as char
}
/// Percent-encodes `path` for use as a file location in SARIF results.
///
/// ASCII letters, digits, `-`, `.`, `_`, `~` and `/` are kept as-is. Every other
/// byte, including each byte of a multi-byte UTF-8 character, is written as `%XX`
/// with uppercase hex digits. An empty path yields `"."`.
fn path_to_uri(path: &str) -> String {
    // Every input byte produces output, so only an empty path gives an empty URI.
    if path.is_empty() {
        return ".".to_owned();
    }

    let mut uri = String::with_capacity(path.len());
    for byte in path.bytes() {
        let keep_verbatim =
            byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'.' | b'_' | b'~' | b'/');
        if keep_verbatim {
            uri.push(char::from(byte));
        } else {
            uri.push('%');
            uri.push(hex_nibble(byte >> 4).to_ascii_uppercase());
            uri.push(hex_nibble(byte & 0x0F).to_ascii_uppercase());
        }
    }
    uri
}
/// Describes the length of an invalid sequence in human-readable form.
///
/// `None` is reported as an incomplete sequence; `Some(n)` as a byte count, with
/// the singular form used only for exactly one byte.
fn invalid_detail(error_len: Option<usize>) -> String {
    error_len.map_or_else(
        || "incomplete sequence".to_owned(),
        |count| {
            if count == 1 {
                "1 invalid byte".to_owned()
            } else {
                format!("{count} invalid bytes")
            }
        },
    )
}
/// Formats one finding as a single line of plain text, without a trailing newline.
///
/// Valid inputs produce `OK <label>`. Invalid inputs produce a `FAIL` line that
/// names the byte offset where validation stopped and the detail from
/// `invalid_detail`.
#[must_use]
pub fn text_line(label: &str, validity: &Validity) -> String {
    match validity {
        Validity::Invalid {
            valid_up_to,
            error_len,
        } => {
            let detail = invalid_detail(*error_len);
            format!("FAIL {label}: invalid UTF-8 at byte {valid_up_to} ({detail})")
        },
        Validity::Valid => format!("OK {label}"),
    }
}
#[must_use]
pub fn json_record(label: &str, validity: &Validity) -> String {
let path = json_escape(label);
match validity {
Validity::Valid => format!(r#"{{"path":"{path}","valid":true}}"#),
Validity::Invalid {
valid_up_to,
error_len,
} => {
let error_len = error_len.map_or_else(|| "null".to_owned(), |len| len.to_string());
format!(
r#"{{"path":"{path}","valid":false,"valid_up_to":{valid_up_to},"error_len":{error_len}}}"#
)
},
}
}
/// Renders all findings as plain text, one `text_line` per finding, each
/// terminated by a newline. An empty slice yields an empty string.
#[must_use]
pub fn text_block(findings: &[Finding]) -> String {
    findings.iter().fold(String::new(), |mut block, finding| {
        block.push_str(&text_line(&finding.label, &finding.validity));
        block.push('\n');
        block
    })
}
/// Renders all findings as a JSON array of `json_record` objects on one line,
/// followed by a newline. An empty slice yields `[]` plus the newline.
#[must_use]
pub fn json_block(findings: &[Finding]) -> String {
    let mut body = String::new();
    for (index, finding) in findings.iter().enumerate() {
        // Comma-separate the records, with no separator before the first.
        if index > 0 {
            body.push(',');
        }
        body.push_str(&json_record(&finding.label, &finding.validity));
    }
    format!("[{body}]\n")
}
/// Builds a pretty-printed SARIF log containing one result per finding.
///
/// Every result carries `RULE_ID` and a file location derived from the finding's
/// label via `path_to_uri`. Invalid inputs are reported at `Level::Error`, valid
/// inputs at `Level::None`. The log is checked with the strict SARIF validator
/// before it is serialized.
///
/// # Errors
///
/// Returns `ReportError::Sarif` if building the log, validating it, or
/// serializing it fails.
pub fn build_sarif(findings: &[Finding]) -> std::result::Result<String, ReportError> {
    let tool = ToolBuilder::new(TOOL_NAME)
        .with_version(env!("CARGO_PKG_VERSION"))
        .add_simple_rule(RULE_ID, "Invalid UTF-8")
        .build();

    // Thread the run builder through the findings, adding one result for each.
    let run = findings
        .iter()
        .fold(RunBuilder::new(tool), |run, finding| {
            let (level, message) = match &finding.validity {
                Validity::Invalid {
                    valid_up_to,
                    error_len,
                } => (
                    Level::Error,
                    format!(
                        "{}: invalid UTF-8 at byte {valid_up_to} ({})",
                        finding.label,
                        invalid_detail(*error_len)
                    ),
                ),
                Validity::Valid => (Level::None, format!("{}: valid UTF-8", finding.label)),
            };
            // NOTE(review): the trailing `1, 1` is presumably line and column;
            // confirm against the sarif_rust `add_file_location` signature.
            run.add_result(
                ResultBuilder::with_text_message(message)
                    .with_rule_id(RULE_ID)
                    .with_level(level)
                    .add_file_location(path_to_uri(&finding.label), 1, 1)
                    .build(),
            )
        });

    let log = SarifLogBuilder::with_standard_schema()
        .add_run(run.build())
        .build()
        .map_err(|err| ReportError::Sarif(err.to_string()))?;

    // Validate before serializing so that an invalid log is never returned.
    SarifValidator::strict()
        .validate_sarif_log(&log)
        .map_err(|err| ReportError::Sarif(err.to_string()))?;

    sarif_rust::to_string_pretty(&log).map_err(|err| ReportError::Sarif(err.to_string()))
}
/// Converts a SARIF JSON document into GitHub-flavored Markdown.
///
/// The generated Markdown is checked with `validate_markdown` before it is
/// returned.
///
/// # Errors
///
/// Returns `ReportError::Markdown` if the report processor cannot be built, if
/// generation fails, or if the generated Markdown fails the check.
pub fn sarif_to_markdown(sarif_json: &str) -> std::result::Result<String, ReportError> {
    // NOTE(review): the meaning of the `true` flag is defined by
    // `SarifMarkdownGenerator::new`; confirm against sarif_to_md_core.
    let markdown = ReportProcessorBuilder::new()
        .generator(SarifMarkdownGenerator::new(
            MarkdownFormat::GitHubFlavored,
            true,
        ))
        .content(sarif_json.to_owned())
        .build()
        .map_err(|err| ReportError::Markdown(err.to_string()))?
        .generate()
        .map_err(|err| ReportError::Markdown(err.to_string()))?;

    validate_markdown(&markdown).map(|()| markdown)
}
/// Checks that generated Markdown is non-empty and looks structured.
///
/// The output is rejected when it is blank after trimming, or when it contains
/// none of `#`, `|` or `---`.
///
/// # Errors
///
/// Returns `ReportError::Markdown` describing which check failed.
fn validate_markdown(markdown: &str) -> std::result::Result<(), ReportError> {
    let has_structure =
        || markdown.contains('#') || markdown.contains('|') || markdown.contains("---");

    let problem = if markdown.trim().is_empty() {
        "output is empty"
    } else if !has_structure() {
        "output missing expected structure (no headings, tables, or rules)"
    } else {
        return Ok(());
    };
    Err(ReportError::Markdown(problem.to_owned()))
}
// Unit tests for the report formatters: JSON escaping, text/JSON records,
// SARIF generation, Markdown conversion, and path-to-URI encoding.
#[cfg(test)]
mod tests {
use super::*;
// Text with nothing to escape is returned unchanged.
#[test]
fn json_escape_passes_through_plain_text() {
assert_eq!(json_escape("plain"), "plain");
}
// Double quotes and backslashes each gain a leading backslash.
#[test]
fn json_escape_handles_quotes_and_backslashes() {
assert_eq!(json_escape(r#"a"b\c"#), r#"a\"b\\c"#);
}
// Newline and tab use short escapes; U+0001 uses the `\u00XX` form.
#[test]
fn json_escape_handles_control_characters() {
assert_eq!(json_escape("line\nbreak\ttab"), "line\\nbreak\\ttab");
assert_eq!(json_escape("\u{0001}"), "\\u0001");
}
// A valid input produces a line containing "OK" and the label.
#[test]
fn text_line_marks_valid_inputs() {
let line = text_line("file.txt", &Validity::Valid);
assert!(line.contains("OK"), "got: {line}");
assert!(line.contains("file.txt"), "got: {line}");
}
// An invalid input produces a line containing "FAIL", the label, and the offset.
#[test]
fn text_line_marks_invalid_inputs_with_location() {
let v = Validity::Invalid {
valid_up_to: 3,
error_len: Some(1),
};
let line = text_line("bad.bin", &v);
assert!(line.contains("FAIL"), "got: {line}");
assert!(line.contains("bad.bin"), "got: {line}");
assert!(line.contains('3'), "got: {line}");
}
// A valid input's record carries the path and `"valid":true`.
#[test]
fn json_record_for_valid_input() {
let rec = json_record("file.txt", &Validity::Valid);
assert!(rec.contains(r#""valid":true"#), "got: {rec}");
assert!(rec.contains(r#""path":"file.txt""#), "got: {rec}");
}
// An invalid input's record carries the offset, and a missing error length
// is emitted as JSON `null`.
#[test]
fn json_record_for_invalid_input() {
let v = Validity::Invalid {
valid_up_to: 3,
error_len: None,
};
let rec = json_record("bad.bin", &v);
assert!(rec.contains(r#""valid":false"#), "got: {rec}");
assert!(rec.contains(r#""valid_up_to":3"#), "got: {rec}");
assert!(rec.contains(r#""error_len":null"#), "got: {rec}");
}
// The label is JSON-escaped before being embedded as the path.
#[test]
fn json_record_escapes_the_path() {
let rec = json_record(r#"a"b"#, &Validity::Valid);
assert!(rec.contains(r#""path":"a\"b""#), "got: {rec}");
}
// Shared fixture: one valid finding and one invalid finding.
fn sample_findings() -> Vec<Finding> {
vec![
Finding {
label: "ok.txt".to_owned(),
validity: Validity::Valid,
},
Finding {
label: "bad.bin".to_owned(),
validity: Validity::Invalid {
valid_up_to: 3,
error_len: Some(1),
},
},
]
}
// The generated SARIF mentions version 2.1.0 and the rule id, and can be
// parsed back by sarif_rust.
#[test]
fn build_sarif_is_valid_and_parses() {
let json = build_sarif(&sample_findings()).expect("sarif builds & validates");
assert!(json.contains("2.1.0"), "expected schema version: {json}");
assert!(json.contains(RULE_ID), "expected rule id: {json}");
let parsed = sarif_rust::from_str(&json);
assert!(parsed.is_ok(), "SARIF should re-parse: {parsed:?}");
}
// Markdown generated from the sample SARIF contains at least one structural marker.
#[test]
fn markdown_from_sarif_has_structure() {
let json = build_sarif(&sample_findings()).unwrap();
let md = sarif_to_markdown(&json).expect("markdown generates");
assert!(
md.contains('#') || md.contains('|') || md.contains("---"),
"markdown should be structured: {md}"
);
}
// Whitespace-only output counts as empty and is rejected.
#[test]
fn validate_markdown_rejects_empty() {
assert!(validate_markdown(" \n ").is_err());
}
// Labels with spaces, non-ASCII characters, and URI-reserved characters must
// still produce SARIF that validates and re-parses; the message keeps the raw label.
#[test]
fn build_sarif_handles_paths_with_spaces_and_unicode() {
let findings = vec![
Finding {
label: "my café/файл .txt".to_owned(),
validity: Validity::Invalid {
valid_up_to: 0,
error_len: Some(1),
},
},
Finding {
label: "C:/weird?name*.bin".to_owned(),
validity: Validity::Valid,
},
];
let json = build_sarif(&findings).expect("tricky paths must still validate");
assert!(sarif_rust::from_str(&json).is_ok());
assert!(json.contains("my café/файл .txt"), "message lost the label");
}
// `/` and unreserved characters pass through, other bytes are percent-encoded
// with uppercase hex, and an empty path becomes ".".
#[test]
fn path_to_uri_encodes_specials_keeps_separators() {
assert_eq!(path_to_uri("a/b.txt"), "a/b.txt");
assert_eq!(path_to_uri("a b"), "a%20b");
assert_eq!(path_to_uri("café"), "caf%C3%A9");
assert_eq!(path_to_uri(""), ".");
}
}