Skip to main content

big_code_analysis/output/
sarif.rs

1//! SARIF 2.1.0 writer for [`OffenderRecord`] batches.
2//!
3//! SARIF (Static Analysis Results Interchange Format) is the OASIS
4//! standard ingested natively by GitHub Code Scanning and most modern
5//! IDE/security tooling. Lizard does not have a SARIF output, so this
6//! is the obvious modern target for `big-code-analysis` integrations.
7//!
8//! We model only the subset of SARIF we actually emit as a small set
9//! of `Serialize` structs (no `sarif` crate dependency). The shape:
10//!
11//! ```json
12//! {
13//!   "version": "2.1.0",
14//!   "$schema": "https://json.schemastore.org/sarif-2.1.0.json",
15//!   "runs": [{
16//!     "tool": { "driver": { "name": "big-code-analysis", "version": "...",
17//!                            "rules": [ { "id": "cyclomatic", ... } ] } },
18//!     "results": [ { "ruleId": "...", "level": "warning", ... } ]
19//!   }]
20//! }
21//! ```
22
23use std::collections::BTreeSet;
24use std::io::{self, Write};
25
26use serde::Serialize;
27
28#[cfg(test)]
29use crate::output::offenders::Severity;
30use crate::output::offenders::{OffenderRecord, TOOL_ID, warn_non_utf8_path};
31
/// SARIF schema URL — pinned to 2.1.0 (the version GitHub Code
/// Scanning ingests). Emitted as the log's top-level `$schema` property.
const SARIF_SCHEMA: &str = "https://json.schemastore.org/sarif-2.1.0.json";
/// SARIF specification version, emitted as the log's top-level
/// `version` property. Names the same release as [`SARIF_SCHEMA`].
const SARIF_VERSION: &str = "2.1.0";
36
/// Short rule descriptions used in `tool.driver.rules[]`, stored as
/// `(metric id, description)` pairs. Metrics not listed fall back to
/// the metric name itself — never fail.
///
/// The metric id is compared for exact equality against the offender's
/// metric string, so ids here must be spelled exactly as the metric
/// names that reach the writer. Note that the three `mi.*` entries are
/// worded "falls below" rather than "exceeds".
const RULE_DESCRIPTIONS: &[(&str, &str)] = &[
    (
        "cyclomatic",
        "Cyclomatic Complexity exceeds the configured threshold.",
    ),
    (
        "cognitive",
        "Cognitive Complexity exceeds the configured threshold.",
    ),
    (
        "loc.sloc",
        "Source lines of code exceed the configured threshold.",
    ),
    (
        "loc.ploc",
        "Physical lines of code exceed the configured threshold.",
    ),
    (
        "loc.lloc",
        "Logical lines of code exceed the configured threshold.",
    ),
    (
        "loc.cloc",
        "Comment lines of code exceed the configured threshold.",
    ),
    (
        "loc.blank",
        "Blank lines of code exceed the configured threshold.",
    ),
    (
        "halstead.volume",
        "Halstead volume exceeds the configured threshold.",
    ),
    (
        "halstead.difficulty",
        "Halstead difficulty exceeds the configured threshold.",
    ),
    (
        "halstead.effort",
        "Halstead effort exceeds the configured threshold.",
    ),
    (
        "halstead.bugs",
        "Estimated Halstead bugs exceed the configured threshold.",
    ),
    (
        "nargs.total",
        "Number of function arguments exceeds the configured threshold.",
    ),
    (
        "nexits.sum",
        "Number of exit points exceeds the configured threshold.",
    ),
    (
        "nom.total",
        "Number of methods/functions exceeds the configured threshold.",
    ),
    (
        "npa.total",
        "Number of public attributes exceeds the configured threshold.",
    ),
    (
        "npm.total",
        "Number of public methods exceeds the configured threshold.",
    ),
    (
        "abc.magnitude",
        "ABC magnitude exceeds the configured threshold.",
    ),
    (
        "wmc.total",
        "Weighted Methods per Class exceeds the configured threshold.",
    ),
    (
        "mi.mi_original",
        "Maintainability Index falls below the configured threshold.",
    ),
    (
        "mi.mi_sei",
        "Maintainability Index (SEI) falls below the configured threshold.",
    ),
    (
        "mi.mi_visual_studio",
        "Maintainability Index (Visual Studio) falls below the configured threshold.",
    ),
];
125
126fn rule_description(metric: &str) -> &str {
127    RULE_DESCRIPTIONS
128        .iter()
129        .find_map(|(name, desc)| (*name == metric).then_some(*desc))
130        .unwrap_or(metric)
131}
132
/// Convert an OS path string into a SARIF `artifactLocation.uri`
/// value (an RFC 3986 URI reference).
///
/// SARIF 2.1.0 §3.4.4 requires `artifactLocation.uri` be a valid URI
/// reference. Backslash separators (Windows paths) and characters
/// outside the URI unreserved/reserved sets break that — the
/// json-schema validator GitHub Code Scanning uses rejects them
/// under the `uri-reference` format. We:
///
/// - Normalize separators to `/`.
/// - Percent-encode (uppercase hex) every byte other than the URI
///   unreserved set, `/`, `@` and — subject to the last rule — `:`.
///   Spaces, `%`, sub-delims and the bytes of non-ASCII UTF-8
///   characters are therefore all escaped.
/// - For absolute Windows paths beginning with a drive letter
///   (`C:\…` → `C:/…`), prefix with `file:///` so the leading `C:`
///   is not interpreted as a URI scheme.
/// - For every other path, percent-encode a `:` that occurs before
///   the first separator (`a:b.rs` → `a%3Ab.rs`, `C:foo` →
///   `C%3Afoo`). RFC 3986 §4.2 forbids a colon in the first segment
///   of a relative-path reference because the text before it would be
///   parsed as a scheme. Colons in later segments are left as-is.
fn path_to_uri_reference(path: &str) -> String {
    let bytes = path.as_bytes();
    // `X:` on its own, or `X:` followed by either separator.
    let is_windows_drive_abs = matches!(
        bytes,
        [drive, b':'] | [drive, b':', b'/' | b'\\', ..] if drive.is_ascii_alphabetic()
    );

    let mut out = String::with_capacity(path.len() + if is_windows_drive_abs { 8 } else { 0 });
    if is_windows_drive_abs {
        out.push_str("file:///");
    }

    // A raw ':' is only unambiguous once something that cannot be part
    // of a scheme name precedes it: the explicit `file:///` prefix or a
    // '/' separator.
    let mut colon_is_literal = is_windows_drive_abs;
    for &b in bytes {
        match b {
            b'/' | b'\\' => {
                out.push('/');
                colon_is_literal = true;
            }
            b':' if colon_is_literal => out.push(':'),
            // RFC 3986 unreserved characters plus '@', which is allowed
            // verbatim in a path segment.
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~' | b'@' => {
                out.push(char::from(b));
            }
            // Everything else — including '%', so a literal percent
            // sign in a file name round-trips as `%25`, and a ':' in
            // the first segment of a non-drive path.
            _ => {
                out.push('%');
                out.push(hex_digit(b >> 4));
                out.push(hex_digit(b & 0x0F));
            }
        }
    }
    out
}

/// Map a nibble (`0..=15`) to its uppercase hexadecimal digit.
///
/// The call sites above only ever pass `b >> 4` or `b & 0x0F`, so
/// larger values do not occur; they map to `'0'` rather than panicking.
fn hex_digit(nibble: u8) -> char {
    match nibble {
        0..=9 => char::from(b'0' + nibble),
        10..=15 => char::from(b'A' + nibble - 10),
        _ => '0',
    }
}
194
195/// Write a SARIF 2.1.0 document for `offenders` to `writer`.
196///
197/// Offenders whose path is not valid UTF-8 are skipped with a warning
198/// to stderr (SARIF `artifactLocation.uri` requires a UTF-8 string).
199/// The empty case emits a well-formed run with empty `results: []` and
200/// `rules: []` so snapshots are stable and CI consumers can already
201/// integrate before the threshold engine (#96) lands.
202///
203/// # Errors
204///
205/// Returns any [`io::Error`] produced by `writer` while emitting the
206/// SARIF JSON document, or a `serde_json::Error` (mapped to `io::Error`
207/// via `io::Error::other`) if a record cannot be serialised.
208pub fn write_sarif<W: Write>(offenders: &[OffenderRecord], mut writer: W) -> io::Result<()> {
209    let mut results: Vec<SarifResult<'_>> = Vec::with_capacity(offenders.len());
210    // BTreeSet so the rules array is deterministic (alphabetical by id).
211    let mut rule_ids: BTreeSet<&str> = BTreeSet::new();
212
213    for record in offenders {
214        let Some(path_str) = warn_non_utf8_path("SARIF", &record.path) else {
215            continue;
216        };
217        rule_ids.insert(record.metric.as_str());
218
219        let logical_locations = record.function.as_deref().map(|name| {
220            vec![LogicalLocation {
221                fully_qualified_name: name,
222            }]
223        });
224
225        results.push(SarifResult {
226            rule_id: &record.metric,
227            level: record.severity.as_str(),
228            message: Message {
229                text: record.default_message(),
230            },
231            locations: vec![Location {
232                physical_location: PhysicalLocation {
233                    artifact_location: ArtifactLocation {
234                        uri: path_to_uri_reference(path_str),
235                    },
236                    region: Region {
237                        start_line: record.start_line.max(1),
238                        end_line: Some(record.end_line.max(record.start_line.max(1))),
239                        start_column: record.start_col,
240                    },
241                },
242                logical_locations,
243            }],
244        });
245    }
246
247    let rules: Vec<Rule<'_>> = rule_ids
248        .iter()
249        .map(|id| Rule {
250            id,
251            short_description: Description {
252                text: rule_description(id),
253            },
254        })
255        .collect();
256
257    let log = SarifLog {
258        schema: SARIF_SCHEMA,
259        version: SARIF_VERSION,
260        runs: vec![Run {
261            tool: Tool {
262                driver: Driver {
263                    name: TOOL_ID,
264                    version: env!("CARGO_PKG_VERSION"),
265                    rules,
266                },
267            },
268            results,
269        }],
270    };
271
272    serde_json::to_writer_pretty(&mut writer, &log)
273        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
274    // `serde_json::to_writer_pretty` does not append a trailing
275    // newline; add one so the output is POSIX-friendly and snapshot
276    // diffs stay clean.
277    writer.write_all(b"\n")
278}
279
/// Root object of the emitted SARIF document.
///
/// Serialises as `{ "$schema": …, "version": …, "runs": [ … ] }`.
#[derive(Serialize)]
struct SarifLog<'a> {
    // `$schema` is not a valid Rust identifier, hence the explicit rename.
    #[serde(rename = "$schema")]
    schema: &'a str,
    version: &'a str,
    runs: Vec<Run<'a>>,
}
287
/// One entry of the top-level `runs[]` array: the tool that produced
/// the findings plus the findings themselves.
#[derive(Serialize)]
struct Run<'a> {
    tool: Tool<'a>,
    results: Vec<SarifResult<'a>>,
}
293
/// The `tool` object of a run; only its `driver` component is emitted.
#[derive(Serialize)]
struct Tool<'a> {
    driver: Driver<'a>,
}
298
/// The `tool.driver` object: tool name, tool version, and the rules
/// referenced by the run's results.
#[derive(Serialize)]
struct Driver<'a> {
    name: &'a str,
    version: &'a str,
    rules: Vec<Rule<'a>>,
}
305
/// One entry of `tool.driver.rules[]`: a rule id and its short
/// description.
#[derive(Serialize)]
struct Rule<'a> {
    id: &'a str,
    // Serialised under the SARIF property name `shortDescription`.
    #[serde(rename = "shortDescription")]
    short_description: Description<'a>,
}
312
/// A `{ "text": … }` wrapper holding borrowed description text.
#[derive(Serialize)]
struct Description<'a> {
    text: &'a str,
}
317
/// One entry of a run's `results[]` array.
///
/// `rename_all = "camelCase"` makes `rule_id` serialise as `ruleId`;
/// the other field names are already single words.
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct SarifResult<'a> {
    rule_id: &'a str,
    level: &'static str,
    message: Message,
    locations: Vec<Location<'a>>,
}
326
/// A `{ "text": … }` wrapper that owns its message text.
#[derive(Serialize)]
struct Message {
    text: String,
}
331
/// One entry of a result's `locations[]` array.
///
/// Serialises as `physicalLocation` plus, when present,
/// `logicalLocations`.
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct Location<'a> {
    physical_location: PhysicalLocation,
    // Left out of the JSON entirely (rather than written as `null`)
    // when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    logical_locations: Option<Vec<LogicalLocation<'a>>>,
}
339
/// The `physicalLocation` object: which artifact, and which region
/// inside it. `artifact_location` serialises as `artifactLocation`.
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct PhysicalLocation {
    artifact_location: ArtifactLocation,
    region: Region,
}
346
/// The `artifactLocation` object, carrying the artifact's `uri` as an
/// owned string.
#[derive(Serialize)]
struct ArtifactLocation {
    uri: String,
}
351
/// The `region` object of a physical location.
///
/// Serialises as `startLine`, plus `endLine` and `startColumn` when
/// they are `Some`.
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct Region {
    start_line: u32,
    // Omitted from the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    end_line: Option<u32>,
    // Omitted from the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    start_column: Option<u32>,
}
361
/// One entry of a location's `logicalLocations[]` array; the single
/// field serialises as `fullyQualifiedName`.
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct LogicalLocation<'a> {
    fully_qualified_name: &'a str,
}
367
#[cfg(test)]
// Lints relaxed for test code only.
// NOTE(review): several of these (e.g. the cast lints) do not look
// necessary for the tests below — presumably a shared template; confirm
// before pruning.
#[allow(
    clippy::float_cmp,
    clippy::cast_precision_loss,
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss,
    clippy::similar_names,
    clippy::doc_markdown,
    clippy::needless_raw_string_hashes,
    clippy::too_many_lines
)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Build a warning-level offender for function `f` spanning lines
    /// 42-50, starting at column 5; tests override fields as needed.
    fn rec(path: &str, metric: &str, value: f64, limit: f64) -> OffenderRecord {
        OffenderRecord {
            path: PathBuf::from(path),
            function: Some("f".into()),
            start_line: 42,
            end_line: 50,
            start_col: Some(5),
            metric: metric.into(),
            value,
            limit,
            severity: Severity::Warning,
        }
    }

    /// Run `write_sarif` into an in-memory buffer and return the text.
    fn render(offenders: &[OffenderRecord]) -> String {
        let mut buf = Vec::new();
        write_sarif(offenders, &mut buf).expect("writing to Vec is infallible");
        String::from_utf8(buf).expect("output is UTF-8")
    }

    #[test]
    fn empty_emits_minimal_valid_run() {
        let out = render(&[]);
        // Round-trips cleanly through serde_json so we know it parses.
        let v: serde_json::Value = serde_json::from_str(&out).expect("valid JSON");
        assert_eq!(v["version"], "2.1.0");
        assert_eq!(v["runs"][0]["tool"]["driver"]["name"], "big-code-analysis");
        assert!(
            v["runs"][0]["results"]
                .as_array()
                .expect("array")
                .is_empty()
        );
        assert!(
            v["runs"][0]["tool"]["driver"]["rules"]
                .as_array()
                .expect("array")
                .is_empty()
        );
    }

    #[test]
    fn single_offender_includes_rule_and_result() {
        let offenders = vec![rec("src/foo.rs", "cyclomatic", 17.0, 15.0)];
        let out = render(&offenders);
        let v: serde_json::Value = serde_json::from_str(&out).expect("valid JSON");
        let result = &v["runs"][0]["results"][0];
        assert_eq!(result["ruleId"], "cyclomatic");
        assert_eq!(result["level"], "warning");
        assert_eq!(result["message"]["text"], "cyclomatic 17 exceeds limit 15");
        let loc = &result["locations"][0];
        assert_eq!(
            loc["physicalLocation"]["artifactLocation"]["uri"],
            "src/foo.rs"
        );
        assert_eq!(loc["physicalLocation"]["region"]["startLine"], 42);
        assert_eq!(loc["physicalLocation"]["region"]["endLine"], 50);
        assert_eq!(loc["physicalLocation"]["region"]["startColumn"], 5);
        assert_eq!(loc["logicalLocations"][0]["fullyQualifiedName"], "f");

        let rule = &v["runs"][0]["tool"]["driver"]["rules"][0];
        assert_eq!(rule["id"], "cyclomatic");
        assert!(rule["shortDescription"]["text"].is_string());
    }

    #[test]
    fn error_severity_maps_to_error_level() {
        let mut r = rec("a.rs", "cyclomatic", 99.0, 15.0);
        r.severity = Severity::Error;
        let out = render(&[r]);
        let v: serde_json::Value = serde_json::from_str(&out).expect("valid JSON");
        assert_eq!(v["runs"][0]["results"][0]["level"], "error");
    }

    #[test]
    fn missing_column_omits_field() {
        let mut r = rec("a.rs", "cyclomatic", 17.0, 15.0);
        r.start_col = None;
        let out = render(&[r]);
        assert!(!out.contains("startColumn"), "{out}");
    }

    #[test]
    fn missing_function_omits_logical_locations() {
        let mut r = rec("a.rs", "cyclomatic", 17.0, 15.0);
        r.function = None;
        let out = render(&[r]);
        assert!(!out.contains("logicalLocations"), "{out}");
    }

    #[test]
    fn rules_deduplicate_per_metric() {
        let offenders = vec![
            rec("a.rs", "cyclomatic", 17.0, 15.0),
            rec("b.rs", "cyclomatic", 20.0, 15.0),
            rec("a.rs", "loc.lloc", 250.0, 100.0),
        ];
        let out = render(&offenders);
        let v: serde_json::Value = serde_json::from_str(&out).expect("valid JSON");
        let rules = v["runs"][0]["tool"]["driver"]["rules"]
            .as_array()
            .expect("array");
        assert_eq!(rules.len(), 2);
        // BTreeSet iteration order: alphabetical.
        assert_eq!(rules[0]["id"], "cyclomatic");
        assert_eq!(rules[1]["id"], "loc.lloc");
    }

    #[test]
    fn unknown_metric_falls_back_to_metric_name_as_description() {
        let r = rec("a.rs", "made.up.metric", 1.0, 0.0);
        let out = render(&[r]);
        let v: serde_json::Value = serde_json::from_str(&out).expect("valid JSON");
        assert_eq!(
            v["runs"][0]["tool"]["driver"]["rules"][0]["shortDescription"]["text"],
            "made.up.metric"
        );
    }

    #[test]
    fn start_line_zero_is_clamped_to_one() {
        let mut r = rec("a.rs", "cyclomatic", 17.0, 15.0);
        r.start_line = 0;
        r.end_line = 0;
        let out = render(&[r]);
        let v: serde_json::Value = serde_json::from_str(&out).expect("valid JSON");
        let region = &v["runs"][0]["results"][0]["locations"][0]["physicalLocation"]["region"];
        assert_eq!(region["startLine"], 1);
        // `end_line` was zeroed too: it must be lifted to the clamped
        // start line rather than emitted as 0 (before the region starts).
        assert_eq!(region["endLine"], 1);
    }

    #[test]
    fn driver_version_matches_pkg_version() {
        let out = render(&[]);
        let v: serde_json::Value = serde_json::from_str(&out).expect("valid JSON");
        assert_eq!(
            v["runs"][0]["tool"]["driver"]["version"],
            env!("CARGO_PKG_VERSION")
        );
    }

    #[test]
    fn windows_drive_path_becomes_file_uri() {
        // Windows absolute path: backslashes flip to /, drive letter
        // gets wrapped in `file:///` so it isn't parsed as a scheme.
        assert_eq!(
            path_to_uri_reference(r"C:\Users\RUNNER~1\AppData\Local\Temp\fixture.rs"),
            "file:///C:/Users/RUNNER~1/AppData/Local/Temp/fixture.rs"
        );
    }

    #[test]
    fn posix_relative_path_is_unchanged() {
        assert_eq!(path_to_uri_reference("src/foo.rs"), "src/foo.rs");
    }

    #[test]
    fn posix_absolute_path_keeps_leading_slash() {
        assert_eq!(path_to_uri_reference("/tmp/foo.rs"), "/tmp/foo.rs");
    }

    #[test]
    fn space_is_percent_encoded() {
        assert_eq!(path_to_uri_reference("src/my file.rs"), "src/my%20file.rs");
    }

    #[test]
    fn empty_snapshot_is_stable() {
        insta::assert_snapshot!("sarif_empty", render(&[]));
    }

    #[test]
    fn multi_offender_snapshot_is_stable() {
        // Mixes severities and exercises both optional fields being absent.
        let mut err = rec("src/zeta.rs", "cognitive", 30.0, 15.0);
        err.severity = Severity::Error;
        err.start_col = None;
        err.function = None;
        let offenders = vec![
            rec("src/alpha.rs", "cyclomatic", 17.0, 15.0),
            rec("src/alpha.rs", "loc.lloc", 250.0, 100.0),
            err,
        ];
        insta::assert_snapshot!("sarif_multi", render(&offenders));
    }
}