perf-sentinel-core 0.8.13

Core library for perf-sentinel: polyglot performance anti-pattern detector
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
//! Deterministic SHA-256 content hash for a [`PeriodicReport`].
//! Canonical form (sorted keys, compact JSON, blanked `content_hash`)
//! and design rationale: `docs/design/08-PERIODIC-DISCLOSURE.md`.

use std::collections::BTreeMap;
use std::fmt::Write as _;
use std::io::Read;

use serde_json::Value;
use sha2::{Digest, Sha256};

use super::errors::HashError;
use super::schema::PeriodicReport;

/// Soft cap, in bytes (256 MiB), on the size of a file hashed by
/// [`binary_hash`] and [`compute_file_sha256_hex`]. perf-sentinel release
/// binaries are tens of MiB; this guards against `current_exe` resolving
/// to an unexpectedly large file (e.g. through a procfs link). A file whose
/// reported length exceeds the cap is rejected with `InvalidData`.
const BINARY_HASH_MAX_BYTES: u64 = 256 * 1024 * 1024;

/// Derive the canonical SHA-256 content hash of `report`.
///
/// The report is serialised to a JSON tree, every post-sign field is
/// replaced by its fixed placeholder, object keys are put in sorted order,
/// and the compact JSON encoding of that tree is hashed.
///
/// The result has the form `"sha256:"` followed by 64 lowercase hex
/// characters.
///
/// # Errors
///
/// Returns [`HashError::Serialize`] if `serde_json` fails to serialise the
/// report or its canonical form.
pub fn compute_content_hash(report: &PeriodicReport) -> Result<String, HashError> {
    let mut tree = serde_json::to_value(report)?;
    blank_content_hash(&mut tree);
    let encoded = serde_json::to_vec(&canonicalize(tree))?;
    Ok(format_sha256(&encoded))
}

/// Placeholder written over a post-sign field when building the canonical
/// form. `Null` matches the unsigned wire shape of typed `Option<_>`
/// locators, `EmptyString` matches scalar fields that serialise as a
/// non-empty string only after signing.
#[derive(Clone, Copy)]
enum BlankZero {
    /// The field is replaced with JSON `null`.
    Null,
    /// The field is replaced with the empty JSON string `""`.
    EmptyString,
}

/// Object/field pairs whose value is zeroed before computing the
/// canonical `content_hash`. Each entry is
/// `(top-level object key, field key, placeholder kind)`.
///
/// These fields are populated or mutated after the initial disclose run,
/// so the hash that the signature covers must not depend on them:
///
/// - `(integrity, content_hash)` is filled by disclose at write time.
/// - `(integrity, signature)` is filled by the operator after
///   `cosign attest-blob` succeeds and the locator is pasted in.
/// - `(integrity, binary_attestation)` is filled when the producing
///   binary carries SLSA provenance recorded post-build.
/// - `(report_metadata, integrity_level)` flips from `hash-only` to
///   `signed` or `signed-with-attestation` once a signature lands.
///
/// When a future schema revision adds a post-signing field (typed
/// `trace_integrity_chain`, an external audit signature, ...), append
/// it here. The hash invariant only holds for the exact set declared;
/// see the regression test in this module.
const POST_SIGN_FIELDS: &[(&str, &str, BlankZero)] = &[
    ("integrity", "content_hash", BlankZero::EmptyString),
    ("integrity", "signature", BlankZero::Null),
    ("integrity", "binary_attestation", BlankZero::Null),
    ("report_metadata", "integrity_level", BlankZero::EmptyString),
];

/// Overwrite every field listed in [`POST_SIGN_FIELDS`] with its
/// placeholder. Despite the name, this covers all post-sign fields, not
/// only `content_hash`.
///
/// The field is inserted even when it was absent, so a report with and
/// without the field ends up with the same shape. An entry whose parent key
/// is missing from `v`, or is not a JSON object, is skipped.
fn blank_content_hash(v: &mut Value) {
    for &(parent, field, zero) in POST_SIGN_FIELDS {
        if let Some(Value::Object(section)) = v.get_mut(parent) {
            let placeholder = match zero {
                BlankZero::EmptyString => Value::String(String::new()),
                BlankZero::Null => Value::Null,
            };
            section.insert(field.to_owned(), placeholder);
        }
    }
}

/// Return `v` with the keys of every JSON object, at any depth, in sorted
/// order. Arrays keep their element order; scalars pass through unchanged.
///
/// Each object is funnelled through a `BTreeMap` before being turned back
/// into a `serde_json::Map`, so the ordering does not depend on how
/// `serde_json::Map` happens to be configured upstream. Dropping that
/// intermediate collect would silently break hash determinism the moment a
/// transitive crate flips the `serde_json/preserve_order` feature.
fn canonicalize(v: Value) -> Value {
    match v {
        Value::Array(items) => Value::Array(items.into_iter().map(canonicalize).collect()),
        Value::Object(fields) => {
            let ordered: BTreeMap<String, Value> = fields
                .into_iter()
                .map(|(key, child)| (key, canonicalize(child)))
                .collect();
            // Entries are fed in ascending key order, so the rebuilt map
            // holds them sorted whichever backing store it uses.
            Value::Object(ordered.into_iter().collect())
        }
        scalar => scalar,
    }
}

fn format_sha256(bytes: &[u8]) -> String {
    let digest = Sha256::digest(bytes);
    let mut out = String::with_capacity(7 + 64);
    out.push_str("sha256:");
    for byte in digest {
        let _ = write!(out, "{byte:02x}");
    }
    out
}

/// Hash the running binary at `std::env::current_exe()` and return the
/// `"sha256:<64-hex>"` string used by
/// [`crate::report::periodic::schema::Integrity::binary_hash`].
///
/// Streams the file via a `BufReader` (no whole-binary allocation) and
/// refuses to hash more than `BINARY_HASH_MAX_BYTES`, so an unexpectedly
/// large `current_exe` resolution cannot OOM the process. The cap is
/// enforced twice: up front from the file metadata, and again on the bytes
/// actually read, so a file whose reported length is missing or stale is
/// rejected instead of being hashed as a silently truncated prefix.
///
/// # Errors
///
/// Returns the I/O error from `current_exe` or the file read when the
/// running executable cannot be resolved or read, and `InvalidData` when
/// the binary is larger than `BINARY_HASH_MAX_BYTES`.
pub fn binary_hash() -> std::io::Result<String> {
    let path = std::env::current_exe()?;
    let file = std::fs::File::open(&path)?;
    // Best-effort fast path: a metadata failure is treated as "length
    // unknown" (0) and left to the streaming check below.
    let total_len = file.metadata().map_or(0, |m| m.len());
    if total_len > BINARY_HASH_MAX_BYTES {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!(
                "binary at {} exceeds {} byte cap ({} bytes), refusing to hash a truncated view",
                path.display(),
                BINARY_HASH_MAX_BYTES,
                total_len
            ),
        ));
    }
    // Allow exactly one byte past the cap: if that byte is ever consumed the
    // file is larger than the metadata claimed and the digest would only
    // cover a prefix of it.
    let mut reader = std::io::BufReader::new(file).take(BINARY_HASH_MAX_BYTES + 1);
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 8192];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            // An interrupted read transferred nothing and can be retried.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        hasher.update(&buf[..n]);
    }
    // The limit only reaches zero when all `cap + 1` bytes were read.
    if reader.limit() == 0 {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!(
                "binary at {} exceeds {} byte cap, refusing to hash a truncated view",
                path.display(),
                BINARY_HASH_MAX_BYTES
            ),
        ));
    }
    let digest = hasher.finalize();
    let mut out = String::with_capacity(7 + 64);
    out.push_str("sha256:");
    for byte in digest {
        // Writing to a `String` cannot fail, so the result is ignored.
        let _ = write!(out, "{byte:02x}");
    }
    Ok(out)
}

/// Hash an arbitrary file by path and return the 64-hex SHA-256 digest
/// (without the `sha256:` prefix, to match the in-toto v1 subject digest
/// convention). Streams through an 8 KiB buffer and applies the same
/// `BINARY_HASH_MAX_BYTES` cap as [`binary_hash`].
///
/// The cap is enforced twice: up front from the file metadata, and again on
/// the bytes actually read, so a file whose reported length is missing or
/// stale is rejected instead of being hashed as a silently truncated prefix.
///
/// # Errors
///
/// Returns the I/O error from opening or reading the file, or
/// `InvalidData` if the file exceeds the safety cap.
pub fn compute_file_sha256_hex(path: &std::path::Path) -> std::io::Result<String> {
    let file = std::fs::File::open(path)?;
    // Best-effort fast path: a metadata failure is treated as "length
    // unknown" (0) and left to the streaming check below.
    let total_len = file.metadata().map_or(0, |m| m.len());
    if total_len > BINARY_HASH_MAX_BYTES {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!(
                "file at {} exceeds {} byte cap ({} bytes)",
                path.display(),
                BINARY_HASH_MAX_BYTES,
                total_len
            ),
        ));
    }
    // Allow exactly one byte past the cap: if that byte is ever consumed the
    // file is larger than the metadata claimed and the digest would only
    // cover a prefix of it.
    let mut reader = std::io::BufReader::new(file).take(BINARY_HASH_MAX_BYTES + 1);
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 8192];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            // An interrupted read transferred nothing and can be retried.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        hasher.update(&buf[..n]);
    }
    // The limit only reaches zero when all `cap + 1` bytes were read.
    if reader.limit() == 0 {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!(
                "file at {} exceeds {} byte cap",
                path.display(),
                BINARY_HASH_MAX_BYTES
            ),
        ));
    }
    let digest = hasher.finalize();
    let mut out = String::with_capacity(64);
    for byte in digest {
        // Writing to a `String` cannot fail, so the result is ignored.
        let _ = write!(out, "{byte:02x}");
    }
    Ok(out)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::report::periodic::schema::{
        Application, Confidentiality, PeriodicReport, ReportIntent,
    };
    use crate::report::periodic::test_fixtures;

    /// Baseline fixture shared by every test: an official, public report
    /// carrying a single G1 application.
    fn sample_report() -> PeriodicReport {
        test_fixtures::sample_report(
            ReportIntent::Official,
            Confidentiality::Public,
            vec![Application::G1(test_fixtures::sample_g1_application())],
        )
    }

    #[test]
    fn hash_is_deterministic() {
        // Re-hashing the same report must always give the same string, and
        // that string must have the documented `sha256:<64 hex>` shape.
        let r = sample_report();
        let first = compute_content_hash(&r).unwrap();
        for _ in 0..100 {
            assert_eq!(compute_content_hash(&r).unwrap(), first);
        }
        assert!(first.starts_with("sha256:"));
        assert_eq!(first.len(), 7 + 64);
        assert!(first[7..].chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn content_hash_survives_json_roundtrip() {
        // verify-hash reparses the report from a file before recomputing the
        // hash, so the hash must be stable across serialize -> parse. With
        // messy floats (long mantissas, the kind disclose produces) the
        // default serde_json parser can shift a value by 1 ULP and break the
        // hash of an untampered report. The `float_roundtrip` feature makes
        // parsing exact. This guards against that feature being dropped.
        let mut r = sample_report();
        r.aggregate.total_energy_kwh = 2.0 / 3.0;
        r.aggregate.total_carbon_kgco2eq = 1.0 / 7.0;
        r.aggregate.canonical_waste.energy_kwh = 100.0 * 5.0 / 6.0 * 1e-7;
        r.aggregate.canonical_waste.carbon_kgco2eq = 10.0 * 5.0 / 6.0 / 1000.0;
        r.aggregate.operational_waste.energy_kwh = 1.0 / 11.0;

        let before = compute_content_hash(&r).unwrap();
        let json = serde_json::to_string(&r).unwrap();
        let reparsed: PeriodicReport = serde_json::from_str(&json).unwrap();
        let after = compute_content_hash(&reparsed).unwrap();

        assert_eq!(before, after, "content_hash must survive a JSON round-trip");
    }

    #[test]
    fn hash_changes_on_aggregate_mutation() {
        // Even a tiny change to a disclosed total must produce a new hash.
        let r = sample_report();
        let baseline = compute_content_hash(&r).unwrap();

        let mut mutated = r.clone();
        mutated.aggregate.total_energy_kwh += 0.000_001;
        let after = compute_content_hash(&mutated).unwrap();

        assert_ne!(baseline, after);
    }

    #[test]
    fn hash_ignores_existing_content_hash() {
        // The stored `content_hash` is blanked before hashing, so whatever
        // value it currently holds must not influence the result.
        let mut r = sample_report();
        r.integrity.content_hash = "sha256:aaaa".to_string();
        let first = compute_content_hash(&r).unwrap();

        r.integrity.content_hash = "sha256:bbbb".to_string();
        let second = compute_content_hash(&r).unwrap();

        assert_eq!(first, second);
    }

    #[test]
    fn canonicalize_is_key_order_invariant() {
        // Two objects differing only in key order serialise to the same bytes
        // once canonicalised.
        let a = serde_json::json!({ "alpha": 1, "beta": 2, "gamma": 3 });
        let b = serde_json::json!({ "gamma": 3, "alpha": 1, "beta": 2 });
        let ca = canonicalize(a);
        let cb = canonicalize(b);
        assert_eq!(
            serde_json::to_vec(&ca).unwrap(),
            serde_json::to_vec(&cb).unwrap()
        );
    }

    #[test]
    fn canonicalize_recurses_into_nested_objects() {
        // Key sorting must also apply to objects nested inside other objects
        // and inside arrays.
        let a = serde_json::json!({
            "outer": { "z": 1, "a": 2 },
            "list": [{ "b": 1, "a": 2 }]
        });
        let b = serde_json::json!({
            "list": [{ "a": 2, "b": 1 }],
            "outer": { "a": 2, "z": 1 }
        });
        assert_eq!(
            serde_json::to_vec(&canonicalize(a)).unwrap(),
            serde_json::to_vec(&canonicalize(b)).unwrap(),
        );
    }

    #[test]
    fn hash_blanks_content_hash_without_removing_key() {
        // Blanking replaces the value with an empty string; the key itself
        // stays in the `integrity` object.
        let r = sample_report();
        let mut v = serde_json::to_value(&r).unwrap();
        blank_content_hash(&mut v);
        let integrity = v.get("integrity").and_then(Value::as_object).unwrap();
        assert!(integrity.contains_key("content_hash"));
        assert_eq!(
            integrity.get("content_hash"),
            Some(&Value::String(String::new()))
        );
    }

    #[test]
    fn format_sha256_known_vector() {
        // SHA-256 of the empty string, well-known constant.
        let empty = format_sha256(&[]);
        assert_eq!(
            empty,
            "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
    }

    #[test]
    fn post_sign_fields_inventory_is_locked() {
        // Pinning the exact set so an unintended removal fails a test
        // rather than silently shrinking the hash invariant. If you
        // add a new post-sign field, update both the const and this
        // expectation.
        let expected: &[(&str, &str)] = &[
            ("integrity", "content_hash"),
            ("integrity", "signature"),
            ("integrity", "binary_attestation"),
            ("report_metadata", "integrity_level"),
        ];
        let actual: Vec<(&str, &str)> = POST_SIGN_FIELDS
            .iter()
            .map(|(p, f, _)| (*p, *f))
            .collect();
        assert_eq!(actual, expected);
    }

    #[test]
    fn hash_changes_when_any_non_blanked_field_mutates() {
        // Negative-cover: mutating a field that is NOT in
        // POST_SIGN_FIELDS must change the hash. One case per sibling
        // object so a future over-broad blanking on any of them gets
        // caught (an attacker tampering with the organisation, the
        // aggregate totals, the methodology header, or the period
        // bounds must invalidate the signature).
        use chrono::Datelike as _;
        let r = sample_report();
        let baseline = compute_content_hash(&r).unwrap();

        let mut m = r.clone();
        m.organisation.name = format!("{}-edited", m.organisation.name);
        assert_ne!(
            compute_content_hash(&m).unwrap(),
            baseline,
            "organisation.name"
        );

        let mut m = r.clone();
        m.aggregate.total_energy_kwh += 1.0;
        assert_ne!(
            compute_content_hash(&m).unwrap(),
            baseline,
            "aggregate.total_energy_kwh"
        );

        let mut m = r.clone();
        m.aggregate.canonical_waste.n_plus_one_threshold += 1;
        assert_ne!(
            compute_content_hash(&m).unwrap(),
            baseline,
            "aggregate.canonical_waste.n_plus_one_threshold"
        );

        let mut m = r.clone();
        m.methodology.sci_specification = format!("{}-v2", m.methodology.sci_specification);
        assert_ne!(
            compute_content_hash(&m).unwrap(),
            baseline,
            "methodology.sci_specification"
        );

        let mut m = r.clone();
        // Days 1..=28 exist in every month, so `with_day` cannot fail here.
        // Days 28..=31 wrap back to 1 so the date always changes; clamping
        // to 28 would be a no-op for a fixture dated on the 28th.
        let day = m.period.from_date.day();
        let shifted_day = if day >= 28 { 1 } else { day + 1 };
        m.period.from_date = m.period.from_date.with_day(shifted_day).unwrap();
        assert_ne!(
            compute_content_hash(&m).unwrap(),
            baseline,
            "period.from_date"
        );
    }

    #[test]
    fn hash_is_invariant_under_post_sign_locator_addition() {
        use crate::report::periodic::schema::{
            BinaryAttestationMetadata, IntegrityLevel, SignatureMetadata,
        };
        // The operator workflow adds `integrity.signature`,
        // `integrity.binary_attestation`, and bumps `integrity_level`
        // AFTER `disclose` has already committed `content_hash`. The
        // canonical form must be invariant under those edits so a
        // signed disclosure still verifies.
        let r = sample_report();
        let baseline = compute_content_hash(&r).unwrap();

        let mut signed = r.clone();
        signed.report_metadata.integrity_level = IntegrityLevel::Signed;
        signed.integrity.signature = Some(SignatureMetadata {
            format: "sigstore-cosign-intoto-v1".to_string(),
            bundle_url: "https://example.fr/bundle.sig".to_string(),
            signer_identity: "ci@example.fr".to_string(),
            signer_issuer: "https://accounts.google.com".to_string(),
            rekor_url: "https://rekor.sigstore.dev".to_string(),
            rekor_log_index: 42,
            signed_at: "2026-05-14T12:00:00Z".to_string(),
        });
        assert_eq!(compute_content_hash(&signed).unwrap(), baseline);

        signed.report_metadata.integrity_level = IntegrityLevel::SignedWithAttestation;
        signed.integrity.binary_attestation = Some(BinaryAttestationMetadata {
            format: "slsa-provenance-v1".to_string(),
            attestation_url: "https://gh/p.intoto.jsonl".to_string(),
            builder_id: "https://github.com/actions/runner".to_string(),
            git_tag: "v0.7.0".to_string(),
            git_commit: "deadbeef".to_string(),
            slsa_level: "L2".to_string(),
        });
        assert_eq!(compute_content_hash(&signed).unwrap(), baseline);
    }

    #[test]
    fn hash_unaffected_by_absent_v1_2_fields_but_changes_when_populated() {
        use crate::report::periodic::schema::TemporalCoverage;
        let r = sample_report();
        let baseline = compute_content_hash(&r).unwrap();

        // The v1.2 additions are absent/default in sample_report, so the
        // canonical form is byte-identical to a pre-v1.2 report: a re-hash of a
        // legacy report keeps its content_hash.
        let v = serde_json::to_value(&r).unwrap();
        assert!(v["aggregate"].get("temporal_coverage").is_none());
        assert!(v["scope_manifest"].get("coverage_basis").is_none());
        assert!(v["integrity"].get("cross_period_log").is_none());

        // Explicitly setting the defaults must not change the hash.
        let mut same = r.clone();
        same.aggregate.temporal_coverage = TemporalCoverage::default();
        assert_eq!(compute_content_hash(&same).unwrap(), baseline);

        // temporal_coverage is disclosed content, NOT a post-sign field: a real
        // value must change the hash so it cannot be silently swapped.
        let mut populated = r;
        populated.aggregate.temporal_coverage = TemporalCoverage {
            temporal_coverage: 0.5,
            observed_days: 45,
            days_in_period: 90,
            largest_gap_days: 10,
        };
        assert_ne!(compute_content_hash(&populated).unwrap(), baseline);
    }
}