taudit_core/
finding.rs

1use crate::graph::{AuthorityGraph, NodeId, NodeKind};
2use crate::propagation::PropagationPath;
3use serde::{Deserialize, Serialize};
4use sha2::{Digest, Sha256};
5use std::path::PathBuf;
6
7// ── Finding-output enhancements (v0.10) ────────────────────────────
8//
9// The blue-team corpus defense report (Section 3) recommends a small
10// set of additive `Finding` fields that consumers (SIEMs, dashboards,
11// triage queues) need but cannot derive cheaply. They are:
12//
13//   * `finding_group_id`       — stable UUID v5 over (namespace, fingerprint)
14//                                 so N hops against one secret cluster into
15//                                 a single advisory in downstream tooling.
16//   * `time_to_fix`             — coarse remediation effort enum so triage
17//                                 dashboards can sort by severity * effort.
18//   * `compensating_controls`   — human-readable list of detected controls
19//                                 that downgraded the finding's severity.
20//   * `suppressed`              — set by the `.taudit-suppressions.yml`
21//                                 applicator; preserves audit trail when a
22//                                 finding has been waived rather than fixed.
23//   * `original_severity`       — pre-downgrade severity; populated whenever
24//                                 the suppression applicator OR a compensating
25//                                 control modifies `severity`.
26//   * `suppression_reason`      — operator-supplied justification from the
27//                                 matching `.taudit-suppressions.yml` entry.
28//
29// All six fields live on `FindingExtras` and are flattened into JSON / SARIF
30// output via `#[serde(flatten)]`. New rules can populate them via
// `Finding::with_time_to_fix(...)` / `Finding::with_compensating_control(...)`
32// without touching the 31+ existing rule sites.
33
34#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
35#[serde(rename_all = "snake_case")]
36pub enum Severity {
37    Critical,
38    High,
39    Medium,
40    Low,
41    Info,
42}
43
44impl Severity {
45    fn rank(self) -> u8 {
46        match self {
47            Severity::Critical => 0,
48            Severity::High => 1,
49            Severity::Medium => 2,
50            Severity::Low => 3,
51            Severity::Info => 4,
52        }
53    }
54}
55
56impl Ord for Severity {
57    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
58        self.rank().cmp(&other.rank())
59    }
60}
61
62impl PartialOrd for Severity {
63    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
64        Some(self.cmp(other))
65    }
66}
67
68/// MVP categories (1-5) are derivable from pipeline YAML alone.
69/// Stretch categories (6-9) need heuristics or metadata enrichment.
70#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
71#[serde(rename_all = "snake_case")]
72pub enum FindingCategory {
73    // MVP
74    AuthorityPropagation,
75    OverPrivilegedIdentity,
76    UnpinnedAction,
77    UntrustedWithAuthority,
78    ArtifactBoundaryCrossing,
79    // Stretch — implemented
80    FloatingImage,
81    LongLivedCredential,
82    /// Credential written to disk by a step (e.g. `persistCredentials: true` on a checkout).
83    /// Disk-persisted credentials are accessible to all subsequent steps and any process
84    /// with filesystem access, unlike runtime-only `HasAccessTo` authority.
85    PersistedCredential,
86    /// Dangerous trigger type (pull_request_target / pr) combined with secret/identity access.
87    TriggerContextMismatch,
88    /// Authority (secret/identity) flows into an opaque external workflow via DelegatesTo.
89    CrossWorkflowAuthorityChain,
90    /// Circular DelegatesTo chain — workflow calls itself transitively.
91    AuthorityCycle,
92    /// Privileged workflow (OIDC/broad identity) with no provenance attestation step.
93    UpliftWithoutAttestation,
94    /// Step writes to the environment gate ($GITHUB_ENV, pipeline variables) — authority can propagate.
95    SelfMutatingPipeline,
96    /// PR-triggered pipeline checks out the repository — attacker-controlled fork code lands on the runner.
97    CheckoutSelfPrExposure,
98    /// ADO variable group consumed by a PR-triggered job, crossing trust boundary.
99    VariableGroupInPrJob,
100    /// Self-hosted agent pool used in a PR-triggered job that also checks out the repository.
101    SelfHostedPoolPrHijack,
102    /// Broad-scope ADO service connection reachable from a PR-triggered job without OIDC.
103    ServiceConnectionScopeMismatch,
104    /// ADO `resources.repositories[]` entry referenced by an `extends:`,
105    /// `template: x@alias`, or `checkout: alias` consumer resolves with no
106    /// `ref:` (default branch) or a mutable branch ref (`refs/heads/<name>`).
107    /// Whoever owns that branch can inject steps into the consuming pipeline.
108    TemplateExtendsUnpinnedBranch,
109    /// ADO `resources.repositories[]` entry pinned to a feature-class branch
110    /// (anything outside the `main` / `master` / `release/*` / `hotfix/*`
111    /// platform set). Feature branches typically have weaker push protection
112    /// than the trunk, so any developer with write access to that branch can
113    /// inject pipeline YAML that runs with the consumer's authority. Strictly
114    /// stronger signal than `template_extends_unpinned_branch` — co-fires.
115    TemplateRepoRefIsFeatureBranch,
116    /// Pipeline step uses an Azure VM remote-exec primitive (Set-AzVMExtension /
117    /// CustomScriptExtension, Invoke-AzVMRunCommand, az vm run-command, az vm extension set)
118    /// where the executed command line interpolates a pipeline secret or a SAS token —
119    /// pipeline-to-VM lateral movement primitive logged in plaintext to the VM and ARM.
120    VmRemoteExecViaPipelineSecret,
121    /// A SAS token freshly minted in-pipeline is interpolated into a CLI argument
122    /// (commandToExecute / scriptArguments / --arguments / -ArgumentList) instead of
123    /// passed via env var or stdin — argv ends up in /proc/*/cmdline, ETW, ARM status.
124    ShortLivedSasInCommandLine,
125    /// Pipeline secret value assigned to a shell variable inside an inline
126    /// script (`export VAR=$(SECRET)`, `$X = "$(SECRET)"`). Once the value
127    /// transits a shell variable, ADO's `$(SECRET)` log mask no longer
128    /// applies — transcripts (`Start-Transcript`, `bash -x`, terraform debug
129    /// logs) print the cleartext.
130    SecretToInlineScriptEnvExport,
131    /// Pipeline secret value written to a file under the agent workspace
132    /// (`$(System.DefaultWorkingDirectory)`, `$(Build.SourcesDirectory)`,
133    /// or relative paths) without `secureFile` task or chmod 600. The file
134    /// persists in the agent workspace and is uploaded by
135    /// `PublishPipelineArtifact` and crawlable by later steps.
136    SecretMaterialisedToWorkspaceFile,
137    /// PowerShell pulls a Key Vault secret with `-AsPlainText` (or
138    /// `ConvertFrom-SecureString -AsPlainText`, or older
139    /// `.SecretValueText` syntax) into a non-`SecureString` variable. The
140    /// value never traverses the ADO variable-group boundary, so verbose
141    /// Az/PS logging and error stack traces print the credential.
142    ///
143    /// Rule id is `keyvault_secret_to_plaintext` (single token "keyvault")
144    /// rather than the snake_case derivation `key_vault_…` — matches the
145    /// docs filename and the convention used in the corpus evidence.
146    #[serde(rename = "keyvault_secret_to_plaintext")]
147    KeyVaultSecretToPlaintext,
148    /// `terraform apply -auto-approve` against a production-named service connection
149    /// without an environment approval gate.
150    TerraformAutoApproveInProd,
151    /// `AzureCLI@2` task with `addSpnToEnvironment: true` AND an inline script —
152    /// the script can launder federated SPN/OIDC tokens into pipeline variables.
153    AddSpnWithInlineScript,
154    /// A `type: string` pipeline parameter (no `values:` allowlist) is interpolated
155    /// via `${{ parameters.X }}` into an inline shell/PowerShell script body —
156    /// shell injection vector for anyone with "queue build".
157    ParameterInterpolationIntoShell,
158    /// A `run:` block fetches a remote script from a mutable URL (`refs/heads/`,
159    /// `/main/`, `/master/`) and pipes it directly to a shell interpreter
160    /// (`curl … | bash`, `wget … | sh`, `bash <(curl …)`, `deno run https://…`).
161    /// Whoever controls that URL's content controls execution on the runner.
162    RuntimeScriptFetchedFromFloatingUrl,
163    /// Workflow trigger combines high-authority PR events
164    /// (`pull_request_target`, `issue_comment`, or `workflow_run`) with a step
165    /// whose `uses:` ref is a mutable branch/tag (not a 40-char SHA). Compromise
166    /// of the action's default branch yields full repo write on the target repo.
167    PrTriggerWithFloatingActionRef,
168    /// A `workflow_run`-triggered workflow captures a value from an external
169    /// API response (`gh pr view`, `gh api`, `curl api.github.com`) and writes
170    /// it into `$GITHUB_ENV`/`$GITHUB_OUTPUT`/`$GITHUB_PATH` without sanitisation.
171    /// A poisoned API field (branch name, title) injects environment variables
172    /// into every subsequent step in the same job.
173    UntrustedApiResponseToEnvSink,
174    /// A `pull_request`-triggered workflow logs into a container registry via a
175    /// floating (non-SHA-pinned) login action. The compromised action receives
176    /// OIDC tokens or registry credentials, and the workflow then pushes a
177    /// PR-controlled image to a shared registry.
178    PrBuildPushesImageWithFloatingCredentials,
179    /// First-party step writes a Secret/Identity-derived value into the
180    /// `$GITHUB_ENV` gate (or pipeline-variable equivalent) and a *later*
181    /// step in the same job that runs in `Untrusted` or `ThirdParty` trust
182    /// zone reads from the runner-managed env (`${{ env.X }}`). The two
183    /// component rules — `self_mutating_pipeline` (writer) and
184    /// `untrusted_with_authority` (consumer) — each see only half the
185    /// chain and emit no finding for the laundered consumer; this rule
186    /// closes the composition gap that R2 attack #3 exploited.
187    SecretViaEnvGateToUntrustedConsumer,
188    /// Positive-invariant rule (GHA): the workflow declares neither a
189    /// top-level nor a per-job `permissions:` block, leaving GITHUB_TOKEN at
190    /// its broad platform default. Fires once per workflow file.
191    NoWorkflowLevelPermissionsBlock,
192    /// Positive-invariant rule (ADO): a job referencing a production-named
193    /// service connection has no `environment:` binding, so it bypasses the
194    /// only ADO-side approval gate regardless of whether `-auto-approve` is
195    /// present. Strictly broader than `terraform_auto_approve_in_prod`.
196    ProdDeployJobNoEnvironmentGate,
197    /// Positive-invariant rule (cross-platform): a long-lived static
198    /// credential is in scope but the workflow does not currently use any
199    /// OIDC identity even though the target cloud supports federation.
200    /// Advisory uplift on top of `long_lived_credential` that wires the
201    /// existing `Recommendation::FederateIdentity` variant.
202    LongLivedSecretWithoutOidcRecommendation,
203    /// Positive-invariant rule (GHA): a PR-triggered workflow has multiple
204    /// privileged jobs where SOME have the standard fork-check `if:` and
205    /// OTHERS do not. Detects an intra-file inconsistency in defensive
206    /// posture — the org has the right instinct but applied it unevenly.
207    PullRequestWorkflowInconsistentForkCheck,
208    /// Positive-invariant rule (GitLab): a job with a production-named
209    /// `environment:` binding has no `rules:` / `only:` clause restricting
210    /// it to protected branches. Deploy job runs (or attempts to run) on
211    /// every pipeline trigger.
212    GitlabDeployJobMissingProtectedBranchOnly,
213    // Reserved — requires ADO/GH API enrichment beyond pipeline YAML
214    /// Requires runtime network telemetry or policy enrichment — not detectable from YAML alone.
215    #[doc(hidden)]
216    EgressBlindspot,
217    /// Requires external audit-sink configuration data — not detectable from YAML alone.
218    #[doc(hidden)]
219    MissingAuditTrail,
220}
221
222/// Routing: scope findings -> TsafeRemediation; isolation findings -> CellosRemediation.
223#[derive(Debug, Clone, Serialize, Deserialize)]
224#[serde(tag = "type", rename_all = "snake_case")]
225pub enum Recommendation {
226    TsafeRemediation {
227        command: String,
228        explanation: String,
229    },
230    CellosRemediation {
231        reason: String,
232        spec_hint: String,
233    },
234    PinAction {
235        current: String,
236        pinned: String,
237    },
238    ReducePermissions {
239        current: String,
240        minimum: String,
241    },
242    FederateIdentity {
243        static_secret: String,
244        oidc_provider: String,
245    },
246    Manual {
247        action: String,
248    },
249}
250
251/// Provenance of a finding — distinguishes findings emitted by built-in
252/// taudit rules from findings emitted by user-loaded custom invariant YAML
253/// (`--invariants-dir`). Custom rules can emit arbitrarily-worded findings
254/// at any severity, so an operator piping output into a JIRA workflow or
255/// SARIF upload needs a non-spoofable signal of which file the rule came
256/// from. Serializes as `"built-in"` (string) for built-in findings and
257/// `{"custom": "<path>"}` for custom-rule findings — see
258/// `docs/finding-fingerprint.md` for the contract.
259#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
260#[serde(rename_all = "snake_case")]
261pub enum FindingSource {
262    /// Emitted by a built-in rule defined in `taudit-core::rules`. The
263    /// authoritative trust anchor — the binary's release commit defines the
264    /// rule logic. Serialises as the kebab-case string `"built-in"` to match
265    /// `schemas/finding.v1.json`.
266    #[default]
267    #[serde(rename = "built-in")]
268    BuiltIn,
269    /// Emitted by a custom invariant rule loaded from the given YAML file.
270    /// The path is the file the rule was loaded from, retained so operators
271    /// can audit which file produced any given finding.
272    Custom { source_file: PathBuf },
273}
274
275impl FindingSource {
276    /// True for findings emitted by built-in rules.
277    pub fn is_built_in(&self) -> bool {
278        matches!(self, FindingSource::BuiltIn)
279    }
280}
281
282/// Coarse-grained remediation effort. Surfaces in JSON `time_to_fix` and SARIF
283/// `properties.timeToFix` so triage dashboards can sort by `severity * effort`.
284///
285/// The four buckets are deliberately wide. Precise time estimates would invite
286/// argument; the buckets exist to separate "flip a flag" from "rewrite a job"
287/// from "renegotiate ops policy".
288///
289/// Per `MEMORY/.../blueteam-corpus-defense.md` Section 3 / Enhancement E-3.
290#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
291#[serde(rename_all = "snake_case")]
292pub enum FixEffort {
293    /// ~5 minutes. Mechanical change to a single file (flip a flag, pin a SHA,
294    /// add a `permissions: {}` block). No structural risk.
295    Trivial,
296    /// ~1 hour. Refactor a step or job: split a script, add a fork-check,
297    /// move a secret to an environment binding.
298    Small,
299    /// ~1 day. Restructure a job or pipeline: introduce an environment gate,
300    /// move from inline scripts to a sandboxed action, add an OIDC role.
301    Medium,
302    /// ~1 week or more. Operational policy change: migrate from PATs to OIDC
303    /// across an org, change branch protection model, retire a service principal.
304    Large,
305}
306
307/// Optional finding metadata. Lives on every `Finding` via
308/// `#[serde(flatten)]` so consumers see the fields at the top of the
309/// finding object — same place they'd appear if declared inline on
310/// `Finding`. Default-constructed extras serialize to nothing (all
311/// `Option::None` and empty `Vec`s skip-serialize), so existing
312/// snapshots remain byte-stable until a rule populates a field.
313///
314/// **Why a wrapper struct?** The 30+ rule call sites use struct
315/// literal syntax. Adding fields directly to `Finding` would force
316/// every site to edit. With `extras: FindingExtras::default()`, new
317/// extras can be added in a single place.
318#[derive(Debug, Clone, Default, Serialize, Deserialize)]
319pub struct FindingExtras {
320    /// Stable UUID v5 over `(NAMESPACE, fingerprint)` — collapses
321    /// per-hop findings against the same authority root into one group
322    /// for SIEM display. See `compute_finding_group_id`.
323    #[serde(default, skip_serializing_if = "Option::is_none")]
324    pub finding_group_id: Option<String>,
325
326    /// Coarse remediation effort. See `FixEffort`.
327    #[serde(default, skip_serializing_if = "Option::is_none")]
328    pub time_to_fix: Option<FixEffort>,
329
330    /// Human-readable list of controls that already neutralise (or partially
331    /// neutralise) this finding — populated when a compensating-control
332    /// detector downgrades severity. Empty when no downgrade applied.
333    #[serde(default, skip_serializing_if = "Vec::is_empty")]
334    pub compensating_controls: Vec<String>,
335
336    /// Set to `true` by the suppression applicator when a matching
337    /// `.taudit-suppressions.yml` entry exists AND the configured mode
338    /// is `Suppress`. The finding still appears in output (audit trail
339    /// preserved) but consumers can filter on this field.
340    #[serde(default, skip_serializing_if = "is_false")]
341    pub suppressed: bool,
342
343    /// Original pre-downgrade severity. Populated by the suppression
344    /// applicator OR a compensating-control detector when `severity`
345    /// is mutated. `None` means the current severity is the rule-emitted
346    /// value.
347    #[serde(default, skip_serializing_if = "Option::is_none")]
348    pub original_severity: Option<Severity>,
349
350    /// Operator-supplied justification from the matching suppression
351    /// entry. `None` when no suppression applies.
352    #[serde(default, skip_serializing_if = "Option::is_none")]
353    pub suppression_reason: Option<String>,
354}
355
/// Serde `skip_serializing_if` predicate: `true` when the flag is unset.
// The `&bool` parameter is dictated by serde, which calls the predicate
// with a reference to the field — hence the lint allowance.
#[allow(clippy::trivially_copy_pass_by_ref)]
fn is_false(b: &bool) -> bool {
    matches!(*b, false)
}
360
361/// A finding is a concrete, actionable authority issue.
362#[derive(Debug, Clone, Serialize, Deserialize)]
363pub struct Finding {
364    pub severity: Severity,
365    pub category: FindingCategory,
366    #[serde(skip_serializing_if = "Option::is_none")]
367    pub path: Option<PropagationPath>,
368    pub nodes_involved: Vec<NodeId>,
369    pub message: String,
370    pub recommendation: Recommendation,
371    /// Provenance of this finding. Defaults to `BuiltIn` for backward
372    /// compatibility with code/JSON that predates the field — every
373    /// in-tree built-in rule sets this explicitly. Deserialization of older
374    /// JSON without the field treats the finding as built-in.
375    #[serde(default)]
376    pub source: FindingSource,
377    /// Optional metadata (group id, time-to-fix, compensating controls,
378    /// suppression markers). Flattens into the JSON object so consumers
379    /// see top-level fields — see `FindingExtras` for individual semantics.
380    #[serde(flatten, default)]
381    pub extras: FindingExtras,
382}
383
384impl Finding {
385    /// Builder helper: attach a `time_to_fix` annotation to this finding.
386    /// Call sites: `let f = Finding { ... }.with_time_to_fix(FixEffort::Trivial);`
387    pub fn with_time_to_fix(mut self, effort: FixEffort) -> Self {
388        self.extras.time_to_fix = Some(effort);
389        self
390    }
391
392    /// Builder helper: append a compensating control description and
393    /// downgrade severity by one tier (Critical -> High -> Medium -> Low -> Info).
394    /// Records the original severity so the audit trail survives.
395    pub fn with_compensating_control(mut self, control: impl Into<String>) -> Self {
396        let original = self.severity;
397        self.extras.compensating_controls.push(control.into());
398        self.severity = downgrade_severity(self.severity);
399        if self.extras.original_severity.is_none() {
400            self.extras.original_severity = Some(original);
401        }
402        self
403    }
404}
405
406/// Move severity one rank toward `Info` (Critical -> High -> ... -> Info).
407/// `Info` stays `Info`. Used by both the suppression applicator and
408/// compensating-control detectors.
409pub fn downgrade_severity(s: Severity) -> Severity {
410    match s {
411        Severity::Critical => Severity::High,
412        Severity::High => Severity::Medium,
413        Severity::Medium => Severity::Low,
414        Severity::Low => Severity::Info,
415        Severity::Info => Severity::Info,
416    }
417}
418
419/// Stable UUID v5 over the finding fingerprint. Two findings whose
420/// fingerprints match (same rule + file + root authority) produce the
421/// same `finding_group_id` — that is the whole point: SIEMs and triage
422/// dashboards collapse N hops against a single secret into one row.
423///
424/// The UUID v5 namespace is a fixed UUID v4 derived once and embedded
425/// here. Treating the namespace as load-bearing is intentional: any
426/// future change here would break every consumer that has stored a
427/// `finding_group_id`. Bump only at a major version.
428pub fn compute_finding_group_id(fingerprint: &str) -> String {
429    // UUID v5 = SHA-1(namespace || name), with version + variant bits set.
430    // Implemented inline so taudit-core stays free of the `uuid` crate
431    // dependency (workspace already depends on it from the CLI; core
432    // remains zero-IO and minimal).
433    const NAMESPACE: [u8; 16] = [
434        0x6c, 0x6f, 0xd0, 0xa3, 0x82, 0x44, 0x4f, 0x29, 0xb1, 0x9a, 0x09, 0xc8, 0x7e, 0x49, 0x55,
435        0x21,
436    ];
437
438    use sha1::{Digest as Sha1Digest, Sha1};
439    let mut hasher = Sha1::new();
440    Sha1Digest::update(&mut hasher, NAMESPACE);
441    Sha1Digest::update(&mut hasher, fingerprint.as_bytes());
442    let hash = hasher.finalize();
443
444    let mut bytes = [0u8; 16];
445    bytes.copy_from_slice(&hash[..16]);
446    // RFC 4122 §4.3: set version to 5 (bits 12-15 of time_hi_and_version)
447    bytes[6] = (bytes[6] & 0x0f) | 0x50;
448    // RFC 4122 §4.4: set variant to RFC 4122 (bits 6-7 of clock_seq_hi)
449    bytes[8] = (bytes[8] & 0x3f) | 0x80;
450
451    format!(
452        "{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
453        bytes[0], bytes[1], bytes[2], bytes[3],
454        bytes[4], bytes[5],
455        bytes[6], bytes[7],
456        bytes[8], bytes[9],
457        bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15],
458    )
459}
460
461// ── Finding fingerprint ────────────────────────────────────
462//
463// Stable cross-run identifier for a finding. Surfaces in:
464//
465//   * SARIF `partialFingerprints[primaryLocationLineHash]`
466//   * JSON  `findings[].fingerprint`
467//   * CloudEvents extension attribute `tauditfindingfingerprint`
468//
469// SIEMs / suppression DBs / dedup pipelines key on this value to
470// recognise "same finding seen on previous run". See
471// `docs/finding-fingerprint.md` for the full contract.
472
/// Extract a custom-rule id from a message shaped like
/// `[<id>] rest of message`.
///
/// Returns the text between the leading `[` and the first `]`. Yields
/// `None` when the message does not begin with `[`, has no closing `]`,
/// or the bracketed id is empty. Mirrors the matching helper in
/// `taudit-report-sarif`; kept private so the surface stays minimal.
fn extract_custom_rule_id(message: &str) -> Option<&str> {
    let after_bracket = message.strip_prefix('[')?;
    let (id, _rest) = after_bracket.split_once(']')?;
    (!id.is_empty()).then_some(id)
}
489
490/// Snake-case rule id derived from a `FindingCategory`. Delegates to
491/// serde so the value tracks the serialized form across renames.
492fn category_rule_id(category: &FindingCategory) -> String {
493    serde_json::to_value(category)
494        .ok()
495        .and_then(|v| v.as_str().map(str::to_string))
496        .unwrap_or_else(|| "unknown".to_string())
497}
498
499/// Public, stable rule-id resolver for a finding.
500///
501/// Returns the snake_case rule id reported alongside this finding. When the
502/// finding's message starts with a bracketed custom-rule prefix
503/// (`[my_rule] ...`), the bracketed id wins so custom YAML rules surface
504/// their declared id. Otherwise the rule id is the snake_case form of the
505/// finding's `category` (the same string serde uses to serialize the
506/// category enum).
507///
508/// JSON, SARIF, and CloudEvents emitters all share this helper to ensure
509/// the `rule_id` field is identical across the three sinks.
510pub fn rule_id_for(finding: &Finding) -> String {
511    extract_custom_rule_id(&finding.message)
512        .map(str::to_string)
513        .unwrap_or_else(|| category_rule_id(&finding.category))
514}
515
516/// Compute a stable cross-run fingerprint for a finding.
517///
518/// The fingerprint identifies "the same logical issue" across re-runs and
519/// across non-cosmetic edits to the surrounding pipeline. Two runs against
520/// the same input file produce the same fingerprint; a fix to the
521/// underlying issue makes the fingerprint disappear; a tweak to the
522/// finding's user-facing message does NOT change the fingerprint.
523///
524/// **Algorithm version `v2`** (replaces v1 from v0.9.1).
525///
526/// v1 collapsed every per-hop finding against the same root Secret/Identity
527/// onto a single fingerprint. That hides genuinely distinct issues — two
528/// untrusted steps reaching the same secret are two separate
529/// remediation-distinct findings, not one. v2 makes every component of the
530/// finding contribute to the hash so unrelated findings cannot alias.
531///
532/// **Inputs (sensitive to):**
533///   * Rule id — either a custom rule id parsed from a `[id] …` message
534///     prefix, or the snake_case form of `finding.category`
535///   * Source file path (`graph.source.file`) — verbatim, never normalised
536///     to a basename, so two pipelines named the same file in different
537///     directories never collide
538///   * Finding category (snake_case)
539///   * Root-authority node name — Secret/Identity name when one is
540///     involved, empty string otherwise. Surfaces the credential identity
541///     in the SIEM context column without being the only differentiator.
542///   * Ordered involved-node names — every node in `nodes_involved`,
543///     joined in original order (preserves caller intent so per-hop
544///     findings against the same secret produce distinct fingerprints).
545///
546/// **Inputs (insensitive to):**
547///   * Wall-clock time
548///   * The finding's `message` text — operators tweak phrasing without
549///     wanting suppressions to break
550///   * `taudit` version string
551///   * Environment / host / cwd
552///   * Pipeline file content hash — only the path matters
553///
554/// Stability guarantee: the v2 algorithm is stable for the v0.10+ line.
555/// Pre-v0.10 (v1 algorithm) suppressions DO NOT carry forward — a one-time
556/// re-baselining is required when upgrading. CHANGELOG and
557/// `docs/finding-fingerprint.md` flag the break explicitly.
558///
559/// Output: SHA-256 of the canonical input string, truncated to the first
560/// 16 hex characters (64 bits — collision-resistant enough for finding
561/// dedup, short enough to be human-glanceable in a SIEM table).
562pub fn compute_fingerprint(finding: &Finding, graph: &AuthorityGraph) -> String {
563    let rule_id = extract_custom_rule_id(&finding.message)
564        .map(str::to_string)
565        .unwrap_or_else(|| category_rule_id(&finding.category));
566
567    let category = category_rule_id(&finding.category);
568    let file = graph.source.file.as_str();
569
570    // Root authority name (if any) — always emitted as its own component,
571    // empty string when no Secret/Identity is involved. Distinct field so
572    // a finding whose root_authority differs from a sibling's is
573    // recognisably different even when the involved-node list happens to
574    // overlap.
575    let root_authority: String = finding
576        .nodes_involved
577        .iter()
578        .filter_map(|id| graph.node(*id))
579        .find(|n| matches!(n.kind, NodeKind::Secret | NodeKind::Identity))
580        .map(|n| n.name.clone())
581        .unwrap_or_default();
582
583    // Ordered involved-node names. Order is preserved (NOT sorted) — for
584    // authority_propagation findings the convention is `[source, sink]`,
585    // so two findings hitting the same secret but reaching different
586    // untrusted steps produce different fingerprints (the v1 collision
587    // class). Empty string when no nodes are involved.
588    let nodes_ordered: String = finding
589        .nodes_involved
590        .iter()
591        .filter_map(|id| graph.node(*id))
592        .map(|n| n.name.as_str())
593        .collect::<Vec<_>>()
594        .join(",");
595
596    // Canonical encoding: every component prefixed with a tag and joined
597    // by `\x1f` (ASCII unit separator) so component boundaries cannot
598    // alias across inputs. Algorithm version baked into the prefix so a
599    // future change to the contract is detectable from the canonical
600    // string alone.
601    let canonical = format!(
602        "v2\x1frule={rule_id}\x1ffile={file}\x1fcategory={category}\x1froot={root_authority}\x1fnodes={nodes_ordered}"
603    );
604
605    let digest = Sha256::digest(canonical.as_bytes());
606    let mut out = String::with_capacity(16);
607    for byte in &digest[..8] {
608        use std::fmt::Write;
609        // 8 bytes -> 16 hex chars
610        let _ = write!(&mut out, "{byte:02x}");
611    }
612    out
613}
614
#[cfg(test)]
mod fingerprint_tests {
    use super::*;
    use crate::graph::{AuthorityGraph, NodeKind, PipelineSource, TrustZone};

    /// Workflow path used by every test that needs only one graph.
    const CI_WORKFLOW: &str = ".github/workflows/ci.yml";

    /// Builds a `PipelineSource` for `file`, leaving repo, ref and commit unset.
    fn source(file: &str) -> PipelineSource {
        PipelineSource {
            file: file.to_owned(),
            repo: None,
            git_ref: None,
            commit_sha: None,
        }
    }

    /// Returns a fresh graph whose source file is [`CI_WORKFLOW`].
    fn ci_graph() -> AuthorityGraph {
        AuthorityGraph::new(source(CI_WORKFLOW))
    }

    /// Builds a built-in, `High`-severity finding with a manual "fix it"
    /// recommendation; only category, message and node list vary per test.
    fn make_finding(category: FindingCategory, msg: &str, nodes: Vec<NodeId>) -> Finding {
        let recommendation = Recommendation::Manual {
            action: String::from("fix it"),
        };
        Finding {
            severity: Severity::High,
            category,
            path: None,
            nodes_involved: nodes,
            message: msg.to_owned(),
            recommendation,
            source: FindingSource::BuiltIn,
            extras: FindingExtras::default(),
        }
    }

    #[test]
    fn fingerprint_is_stable_across_repeat_calls() {
        let mut graph = ci_graph();
        let secret = graph.add_node(NodeKind::Secret, "AWS_KEY", TrustZone::FirstParty);
        let finding = make_finding(
            FindingCategory::AuthorityPropagation,
            "AWS_KEY reaches third party",
            vec![secret],
        );

        let first = compute_fingerprint(&finding, &graph);
        let second = compute_fingerprint(&finding, &graph);

        assert_eq!(
            first, second,
            "same finding must hash identically across calls"
        );
        assert_eq!(first.len(), 16, "fingerprint is 16 hex chars");
        assert!(first.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn different_files_produce_different_fingerprints() {
        // Identical secret name, category and message; only the file differs.
        let mut graph_a = AuthorityGraph::new(source("workflows/a.yml"));
        let token_a = graph_a.add_node(NodeKind::Secret, "TOKEN", TrustZone::FirstParty);
        let finding_a = make_finding(FindingCategory::UnpinnedAction, "msg", vec![token_a]);

        let mut graph_b = AuthorityGraph::new(source("workflows/b.yml"));
        let token_b = graph_b.add_node(NodeKind::Secret, "TOKEN", TrustZone::FirstParty);
        let finding_b = make_finding(FindingCategory::UnpinnedAction, "msg", vec![token_b]);

        let fp_a = compute_fingerprint(&finding_a, &graph_a);
        let fp_b = compute_fingerprint(&finding_b, &graph_b);
        assert_ne!(fp_a, fp_b);
    }

    #[test]
    fn different_rules_produce_different_fingerprints() {
        // Same graph, node and message; only the category differs.
        let mut graph = ci_graph();
        let secret = graph.add_node(NodeKind::Secret, "AWS_KEY", TrustZone::FirstParty);
        let propagation =
            make_finding(FindingCategory::AuthorityPropagation, "msg", vec![secret]);
        let untrusted =
            make_finding(FindingCategory::UntrustedWithAuthority, "msg", vec![secret]);

        let fp_propagation = compute_fingerprint(&propagation, &graph);
        let fp_untrusted = compute_fingerprint(&untrusted, &graph);
        assert_ne!(fp_propagation, fp_untrusted);
    }

    #[test]
    fn message_changes_do_not_affect_fingerprint() {
        // Cross-run dedup relies on this: rewording a message must not
        // invalidate suppressions that a SIEM keyed on the fingerprint.
        let mut graph = ci_graph();
        let secret = graph.add_node(NodeKind::Secret, "AWS_KEY", TrustZone::FirstParty);
        let old_wording = make_finding(
            FindingCategory::AuthorityPropagation,
            "old phrasing of the message",
            vec![secret],
        );
        let new_wording = make_finding(
            FindingCategory::AuthorityPropagation,
            "completely different new phrasing",
            vec![secret],
        );

        let fp_old = compute_fingerprint(&old_wording, &graph);
        let fp_new = compute_fingerprint(&new_wording, &graph);
        assert_eq!(fp_old, fp_new);
    }

    #[test]
    fn per_hop_findings_against_same_authority_are_distinct() {
        // v2 contract: one secret reaching N distinct untrusted steps yields
        // N distinct fingerprints, because each (secret, step) pair is
        // remediated separately. The v1 behaviour collapsed them and hid
        // genuinely different exposure surfaces. Consumers that want a
        // per-secret rollup can group on root_authority on their side.
        let mut graph = ci_graph();
        let token = graph.add_node(NodeKind::Secret, "DEPLOY_TOKEN", TrustZone::FirstParty);
        let first_hop = graph.add_node(NodeKind::Step, "deploy[0]", TrustZone::Untrusted);
        let second_hop = graph.add_node(NodeKind::Step, "deploy[1]", TrustZone::Untrusted);

        let finding_first = make_finding(
            FindingCategory::AuthorityPropagation,
            "DEPLOY_TOKEN reaches deploy[0]",
            vec![token, first_hop],
        );
        let finding_second = make_finding(
            FindingCategory::AuthorityPropagation,
            "DEPLOY_TOKEN reaches deploy[1]",
            vec![token, second_hop],
        );

        let fp_first = compute_fingerprint(&finding_first, &graph);
        let fp_second = compute_fingerprint(&finding_second, &graph);
        assert_ne!(
            fp_first,
            fp_second,
            "per-hop findings against one secret must produce distinct \
             fingerprints — sink identity is part of the issue"
        );
    }

    #[test]
    fn same_secret_same_sink_remains_stable_across_calls() {
        // Hashing the very same finding twice (same secret, sink and file)
        // must give the same answer: the extra v2 inputs must not introduce
        // any non-determinism, or cross-run dedup would break.
        let mut graph = ci_graph();
        let token = graph.add_node(NodeKind::Secret, "DEPLOY_TOKEN", TrustZone::FirstParty);
        let sink = graph.add_node(NodeKind::Step, "deploy[0]", TrustZone::Untrusted);
        let finding = make_finding(
            FindingCategory::AuthorityPropagation,
            "msg",
            vec![token, sink],
        );

        let first = compute_fingerprint(&finding, &graph);
        let second = compute_fingerprint(&finding, &graph);
        assert_eq!(first, second);
    }

    #[test]
    fn r2_attack2_two_files_same_secret_name_distinct_fingerprints() {
        // Reproducer for R2 attack #2: two unrelated findings living in two
        // different pipeline files happen to share a secret NAME. The
        // pre-v0.9.1 algorithm could collide on this input; v2 mixes the
        // file path into the hash so names cannot alias across files.
        let mut graph_a = AuthorityGraph::new(source("workflows/a.yml"));
        let secret_a = graph_a.add_node(NodeKind::Secret, "MY_SECRET", TrustZone::FirstParty);
        let sink_a = graph_a.add_node(NodeKind::Step, "evil/action", TrustZone::Untrusted);

        let mut graph_b = AuthorityGraph::new(source("workflows/b.yml"));
        let secret_b = graph_b.add_node(NodeKind::Secret, "MY_SECRET", TrustZone::FirstParty);
        let sink_b = graph_b.add_node(
            NodeKind::Step,
            "different-evil/action",
            TrustZone::Untrusted,
        );

        let finding_a = make_finding(
            FindingCategory::AuthorityPropagation,
            "MY_SECRET reaches evil/action",
            vec![secret_a, sink_a],
        );
        let finding_b = make_finding(
            FindingCategory::AuthorityPropagation,
            "MY_SECRET reaches different-evil/action",
            vec![secret_b, sink_b],
        );

        let fp_a = compute_fingerprint(&finding_a, &graph_a);
        let fp_b = compute_fingerprint(&finding_b, &graph_b);
        assert_ne!(
            fp_a,
            fp_b,
            "two genuinely different findings must not share a fingerprint \
             just because the secret name overlaps"
        );
    }

    #[test]
    fn root_authority_segment_is_always_present_even_when_empty() {
        // Findings that involve no Secret/Identity node (floating_image is
        // one example) still need a stable fingerprint. This test covers the
        // "different node lists => different fingerprints" half of that
        // contract and checks the output keeps its usual 16-char shape.
        let mut graph = ci_graph();
        let alpine = graph.add_node(NodeKind::Image, "alpine:latest", TrustZone::ThirdParty);
        let ubuntu = graph.add_node(NodeKind::Image, "ubuntu:22.04", TrustZone::ThirdParty);
        let finding_alpine = make_finding(FindingCategory::FloatingImage, "msg-a", vec![alpine]);
        let finding_ubuntu = make_finding(FindingCategory::FloatingImage, "msg-b", vec![ubuntu]);

        let fp_a = compute_fingerprint(&finding_alpine, &graph);
        let fp_b = compute_fingerprint(&finding_ubuntu, &graph);

        assert_ne!(
            fp_a, fp_b,
            "two distinct floating-image findings must not collide"
        );
        for fingerprint in [&fp_a, &fp_b] {
            assert_eq!(fingerprint.len(), 16);
        }
    }

    #[test]
    fn node_order_is_significant() {
        // Caller order of nodes_involved feeds the hash unchanged, so
        // [secret, step] and [step, secret] (source vs sink roles swapped)
        // are different findings. Rules therefore have to push nodes in a
        // consistent order; the original author notes every built-in does.
        let mut graph = ci_graph();
        let secret = graph.add_node(NodeKind::Secret, "K", TrustZone::FirstParty);
        let step = graph.add_node(NodeKind::Step, "use", TrustZone::Untrusted);
        let forward = make_finding(
            FindingCategory::AuthorityPropagation,
            "x",
            vec![secret, step],
        );
        let reverse = make_finding(
            FindingCategory::AuthorityPropagation,
            "x",
            vec![step, secret],
        );

        let fp_forward = compute_fingerprint(&forward, &graph);
        let fp_reverse = compute_fingerprint(&reverse, &graph);
        assert_ne!(
            fp_forward, fp_reverse,
            "node order must influence the fingerprint so role swap is detectable"
        );
    }

    #[test]
    fn custom_rule_id_in_message_is_used() {
        // A custom rule announces its id through an `[id] message` prefix.
        // The fingerprint has to key on that id rather than falling back to
        // the category, so the prefixed and unprefixed findings must differ.
        let mut graph = ci_graph();
        let secret = graph.add_node(NodeKind::Secret, "X", TrustZone::FirstParty);
        let with_custom_id = make_finding(
            FindingCategory::UnpinnedAction,
            "[my_custom_rule] something happened",
            vec![secret],
        );
        let without_prefix = make_finding(
            FindingCategory::UnpinnedAction,
            "no prefix here",
            vec![secret],
        );

        let fp_custom = compute_fingerprint(&with_custom_id, &graph);
        let fp_plain = compute_fingerprint(&without_prefix, &graph);
        assert_ne!(
            fp_custom, fp_plain,
            "custom rule id must distinguish from category fallback"
        );
    }

    #[test]
    fn finding_group_id_is_deterministic_uuid_v5() {
        const FINGERPRINT: &str = "5edb30f4db3b5fa3";

        // Identical fingerprint in, byte-identical group id out.
        let g1 = compute_finding_group_id(FINGERPRINT);
        let g2 = compute_finding_group_id(FINGERPRINT);
        assert_eq!(g1, g2);

        // Textual UUID layout is 8-4-4-4-12 hex digits, 36 chars in total.
        assert_eq!(g1.len(), 36);

        // Index 14 holds the version nibble, which is '5' for a v5 UUID.
        let version = g1.chars().nth(14);
        assert_eq!(version, Some('5'), "expected v5 marker, got {g1}");

        // Index 19 holds the variant nibble; RFC 4122 allows 8, 9, a or b.
        let variant = g1.chars().nth(19).unwrap();
        assert!(
            ['8', '9', 'a', 'b'].contains(&variant),
            "expected RFC 4122 variant, got {variant}"
        );

        // A different fingerprint must map to a different group id.
        let other = compute_finding_group_id("a3c8d9e1f2b4c5d6");
        assert_ne!(g1, other);
    }

    #[test]
    fn with_time_to_fix_attaches_effort() {
        let mut graph = ci_graph();
        let secret = graph.add_node(NodeKind::Secret, "X", TrustZone::FirstParty);
        let base = make_finding(FindingCategory::UnpinnedAction, "msg", vec![secret]);

        let finding = base.with_time_to_fix(FixEffort::Trivial);

        assert_eq!(finding.extras.time_to_fix, Some(FixEffort::Trivial));
    }

    #[test]
    fn with_compensating_control_downgrades_and_records_original() {
        let mut graph = ci_graph();
        let secret = graph.add_node(NodeKind::Secret, "X", TrustZone::FirstParty);
        let base = make_finding(FindingCategory::TriggerContextMismatch, "msg", vec![secret]);

        let finding = base.with_compensating_control("fork check present");

        // make_finding starts at High; one control steps it down to Medium
        // and the pre-downgrade value is kept on the extras.
        assert_eq!(finding.severity, Severity::Medium);
        assert_eq!(finding.extras.original_severity, Some(Severity::High));
        assert_eq!(finding.extras.compensating_controls.len(), 1);
    }

    #[test]
    fn empty_node_list_still_produces_fingerprint() {
        // Some categories (authority_cycle, floating_image, unpinned_action)
        // may be emitted without any authority node, so an empty node list
        // must still hash to a well-formed fingerprint.
        let graph = ci_graph();
        let finding = make_finding(FindingCategory::UnpinnedAction, "no nodes here", Vec::new());

        let fingerprint = compute_fingerprint(&finding, &graph);

        assert_eq!(fingerprint.len(), 16);
        assert!(fingerprint.chars().all(|c| c.is_ascii_hexdigit()));
    }
}
910
#[cfg(test)]
mod source_tests {
    use super::*;

    /// Builds the minimal finding used by the round-trip tests: no path, no
    /// nodes, a manual "fix" recommendation and default extras. Only the
    /// severity, message and provenance vary between callers.
    fn finding_with_source(severity: Severity, message: &str, source: FindingSource) -> Finding {
        Finding {
            severity,
            category: FindingCategory::AuthorityPropagation,
            path: None,
            nodes_involved: Vec::new(),
            message: message.to_owned(),
            recommendation: Recommendation::Manual {
                action: "fix".to_owned(),
            },
            source,
            extras: FindingExtras::default(),
        }
    }

    #[test]
    fn built_in_serializes_as_string() {
        let encoded = serde_json::to_value(&FindingSource::BuiltIn).unwrap();
        let expected = serde_json::json!("built-in");
        assert_eq!(encoded, expected);
    }

    #[test]
    fn custom_serializes_with_path_payload() {
        let custom = FindingSource::Custom {
            source_file: PathBuf::from("/policies/no_prod_pat.yml"),
        };
        let encoded = serde_json::to_value(&custom).unwrap();
        assert_eq!(
            encoded,
            serde_json::json!({"custom": {"source_file": "/policies/no_prod_pat.yml"}})
        );
    }

    #[test]
    fn finding_round_trip_preserves_built_in_source() {
        let original = finding_with_source(Severity::High, "x", FindingSource::BuiltIn);

        let s = serde_json::to_string(&original).unwrap();
        // The wire form is the bare literal `"source":"built-in"`, so anyone
        // reading raw JSON can tell at a glance the rule shipped with the tool.
        assert!(
            s.contains("\"source\":\"built-in\""),
            "built-in source must serialise as \"built-in\": {s}"
        );

        let decoded: Finding = serde_json::from_str(&s).unwrap();
        assert_eq!(decoded.source, FindingSource::BuiltIn);
    }

    #[test]
    fn finding_round_trip_preserves_custom_source_with_path() {
        let path = PathBuf::from("/work/invariants/no_prod_pat.yml");
        let original = finding_with_source(
            Severity::Critical,
            "[no_prod_pat] hit",
            FindingSource::Custom {
                source_file: path.clone(),
            },
        );

        let s = serde_json::to_string(&original).unwrap();
        assert!(
            s.contains("\"custom\""),
            "custom source must serialise with `custom` key: {s}"
        );
        assert!(
            s.contains("/work/invariants/no_prod_pat.yml"),
            "custom source must include the loader path: {s}"
        );

        let decoded: Finding = serde_json::from_str(&s).unwrap();
        let expected = FindingSource::Custom { source_file: path };
        assert_eq!(
            decoded.source, expected,
            "round-trip must preserve custom source path"
        );
    }

    #[test]
    fn missing_source_field_deserializes_as_built_in() {
        // Backward compatibility: JSON written before provenance existed has
        // no `source` key at all, and the serde default fills in `BuiltIn`.
        // If that default went away, old suppression databases would stop
        // parsing after an upgrade.
        let json = r#"{
            "severity": "high",
            "category": "authority_propagation",
            "nodes_involved": [],
            "message": "old-format finding",
            "recommendation": {"type": "manual", "action": "review"}
        }"#;
        let legacy: Finding = serde_json::from_str(json).expect("legacy JSON must parse");
        assert_eq!(legacy.source, FindingSource::BuiltIn);
    }
}
taudit_core/finding.rs

taudit_core/
finding.rs