taudit_core/graph.rs
1use serde::{Deserialize, Serialize, Serializer};
2use std::collections::{BTreeMap, HashMap};
3
4/// Unique identifier for a node in the authority graph.
5pub type NodeId = usize;
6
7/// Unique identifier for an edge in the authority graph.
8pub type EdgeId = usize;
9
10// ── Metadata key constants ─────────────────────────────
11// Avoids stringly-typed bugs across crate boundaries.
12
/// Metadata key under which a digest value is stored.
/// NOTE(review): presumably the pinned digest of an image/action reference — the
/// writers are outside this section; confirm against the parsers.
pub const META_DIGEST: &str = "digest";
/// Metadata key for an identity's permissions string. [`AuthorityEdgeSummary`]
/// documents copying this value (truncated) into `permissions_summary`.
pub const META_PERMISSIONS: &str = "permissions";
/// Metadata key for an identity's scope classification. [`AuthorityEdgeSummary`]
/// documents copying this value into `identity_scope`.
/// NOTE(review): presumably holds a rendering of [`IdentityScope`] — confirm.
pub const META_IDENTITY_SCOPE: &str = "identity_scope";
/// Metadata key flagging an inferred entry.
/// NOTE(review): exact semantics are not visible in this section — presumably set
/// when a node or relationship was inferred rather than declared; confirm.
pub const META_INFERRED: &str = "inferred";
17/// Marks an Image node as a job container (not a `uses:` action).
18pub const META_CONTAINER: &str = "container";
19/// Marks an Identity node as OIDC-capable (`permissions: id-token: write`).
20pub const META_OIDC: &str = "oidc";
21/// Marks a Secret node whose value is interpolated into a CLI flag argument (e.g. `-var "key=$(SECRET)"`).
22/// CLI flag values appear in pipeline log output even when ADO secret masking is active,
23/// because the command string is logged before masking runs and Terraform itself logs `-var` values.
24pub const META_CLI_FLAG_EXPOSED: &str = "cli_flag_exposed";
25/// Graph-level metadata: identifies the trigger type (e.g. `pull_request_target`, `pr`).
26pub const META_TRIGGER: &str = "trigger";
27/// Marks a Step that writes to the environment gate (`$GITHUB_ENV`, ADO `##vso[task.setvariable]`).
28pub const META_WRITES_ENV_GATE: &str = "writes_env_gate";
29/// Marks a Step that writes a `$(secretRef)` value to the env gate. Co-set with
30/// META_WRITES_ENV_GATE when the written VALUE contains an ADO `$(VAR)` expression,
31/// distinguishing secret-exfiltration from plain-integer or literal env-gate writes.
32pub const META_ENV_GATE_WRITES_SECRET_VALUE: &str = "env_gate_writes_secret_value";
33/// Marks a Step that came from an ADO `##vso[task.setvariable]` call (as opposed to
34/// a GHA `>> $GITHUB_ENV` redirect). Used to distinguish the two env-gate write
35/// patterns so BUG-4 suppression only applies to ADO plain-value writes.
36pub const META_SETVARIABLE_ADO: &str = "setvariable_ado";
37/// Marks a Step that reads from the runner-managed environment via an
38/// `env.<NAME>` template reference — `${{ env.X }}` in a `with:` value,
39/// inline script body, or step `env:` mapping. Distinct from `secrets.X`
40/// references (which produce a HasAccessTo edge to a Secret node) — `env.X`
41/// references can be sourced from the ambient runner environment, including
42/// values laundered through `$GITHUB_ENV` by an earlier step. Stamped by
43/// the GHA parser so `secret_via_env_gate_to_untrusted_consumer` can find
44/// the gate-laundering chain that the explicit-secret rules miss.
45pub const META_READS_ENV: &str = "reads_env";
46/// Marks a Step that performs cryptographic provenance attestation (e.g. `actions/attest-build-provenance`).
47pub const META_ATTESTS: &str = "attests";
48/// Marks a Secret node sourced from an ADO variable group (vs inline pipeline variable).
49pub const META_VARIABLE_GROUP: &str = "variable_group";
50/// Marks an Image node as a self-hosted agent pool (pool.name on ADO; runs-on: self-hosted on GHA).
51pub const META_SELF_HOSTED: &str = "self_hosted";
52/// Marks a Step that performs a `checkout: self` (ADO) or default `actions/checkout` on a PR context.
53pub const META_CHECKOUT_SELF: &str = "checkout_self";
54/// Marks an Identity node as an ADO service connection.
55pub const META_SERVICE_CONNECTION: &str = "service_connection";
56/// Marks an Identity node as implicitly injected by the platform (e.g. ADO System.AccessToken).
57/// Implicit tokens are structurally accessible to all tasks by platform design — exposure
58/// to untrusted steps is Info-level (structural) rather than Critical (misconfiguration).
59pub const META_IMPLICIT: &str = "implicit";
60/// Marks a Step that belongs to an ADO deployment job whose `environment:` is
61/// configured with required approvals — a manual gate that breaks automatic
62/// authority propagation. Findings whose path crosses such a node have their
63/// severity reduced by one step (Critical → High → Medium → Low).
64pub const META_ENV_APPROVAL: &str = "env_approval";
65/// Records the parent job name on every Step node, enabling per-job subgraph
66/// filtering (e.g. `taudit map --job build`) and downstream consumers that
67/// need to attribute steps back to their containing job. Set by both the GHA
68/// and ADO parsers on every Step they create within a job's scope.
69pub const META_JOB_NAME: &str = "job_name";
70/// Graph-level metadata: JSON-encoded array of `resources.repositories[]`
71/// entries declared by the pipeline. Each entry is an object with fields
72/// `alias`, `repo_type`, `name`, optional `ref`, and `used` (true when the
73/// alias is referenced via `template: x@alias`, `extends: x@alias`, or
74/// `checkout: alias` somewhere in the same pipeline file). Set by the ADO
75/// parser; consumed by `template_extends_unpinned_branch`.
76pub const META_REPOSITORIES: &str = "repositories";
77/// Records the raw inline script body of a Step (the text from
78/// `script:` / `bash:` / `powershell:` / `pwsh:` / `run:` / task
79/// `inputs.script` / `inputs.Inline` / `inputs.inlineScript`). Stamped by
80/// parsers when the step has an inline script. Consumed by script-aware
81/// rules: `vm_remote_exec_via_pipeline_secret`,
82/// `short_lived_sas_in_command_line`, `secret_to_inline_script_env_export`,
83/// `secret_materialised_to_workspace_file`, `keyvault_secret_to_plaintext`,
84/// `add_spn_with_inline_script`, `parameter_interpolation_into_shell`.
85/// Stored verbatim — rules apply their own pattern matching.
86pub const META_SCRIPT_BODY: &str = "script_body";
87/// Records the name of the ADO service connection a step uses (the value of
88/// `inputs.azureSubscription` / `inputs.connectedServiceName*`). Set on the
89/// Step node itself (in addition to the Identity node it links to) so rules
90/// can pattern-match on the connection name without traversing edges.
91pub const META_SERVICE_CONNECTION_NAME: &str = "service_connection_name";
92/// Marks a Step as performing `terraform apply ... -auto-approve` (either via
93/// an inline script or via a `TerraformCLI` / `TerraformTask` task with
94/// `command: apply` and `commandOptions` containing `auto-approve`).
95pub const META_TERRAFORM_AUTO_APPROVE: &str = "terraform_auto_approve";
96/// Marks a Step task that runs with `addSpnToEnvironment: true`, exposing
97/// the federated SPN (idToken / servicePrincipalKey / servicePrincipalId /
98/// tenantId) to the inline script body via environment variables.
99pub const META_ADD_SPN_TO_ENV: &str = "add_spn_to_environment";
100/// Graph-level metadata: identifies the source platform of the parsed
101/// pipeline. Set by every parser to its `platform()` value
102/// (`"github-actions"`, `"azure-devops"`, `"gitlab"`). Allows platform-scoped
103/// rules to gate their detection without parsing the source file path.
104pub const META_PLATFORM: &str = "platform";
105/// Graph-level metadata: marks a GitHub Actions workflow as having NO
106/// top-level `permissions:` block declared. Set by the GHA parser when
107/// `workflow.permissions` is absent so rules can detect the negative-space
108/// "no permissions block at all" pattern (which leaves `GITHUB_TOKEN` at its
109/// broad platform default — `contents: write`, `packages: write`, etc.).
110pub const META_NO_WORKFLOW_PERMISSIONS: &str = "no_workflow_permissions";
111/// Marks a Step in a GHA workflow as carrying an `if:` condition that
112/// references the standard fork-check pattern
113/// (`github.event.pull_request.head.repo.fork == false` or the equivalent
114/// `head.repo.full_name == github.repository`). Stamped by the GHA parser so
115/// rules can credit the step with the compensating control without
116/// re-parsing the YAML expression. Bool stored as `"true"`.
117pub const META_FORK_CHECK: &str = "fork_check";
118/// Marks a GitLab CI job (Step node) whose `rules:` or `only:` clause
119/// restricts execution to protected branches — either via an explicit
120/// `if: $CI_COMMIT_REF_PROTECTED == "true"` rule, an `if: $CI_COMMIT_BRANCH
121/// == $CI_DEFAULT_BRANCH` rule, or an `only: [main, ...]` allowlist of
122/// platform-protected refs. Set by the GitLab parser. Absence on a
123/// deployment job is a control gap.
124pub const META_RULES_PROTECTED_ONLY: &str = "rules_protected_only";
125/// Graph-level metadata: comma-joined list of every entry under `on:` (e.g.
126/// `pull_request_target,issue_comment,workflow_run`). Distinct from
127/// `META_TRIGGER` (singular) which is set only for `pull_request_target` /
128/// ADO `pr` to preserve the existing `trigger_context_mismatch` contract.
129/// Consumers of this list (e.g. `risky_trigger_with_authority`) must split on
130/// `,` and treat each token as a trigger name.
131pub const META_TRIGGERS: &str = "triggers";
132/// Graph-level metadata: comma-joined list of `workflow_dispatch.inputs.*`
133/// names declared by the workflow. Empty / absent if the workflow has no
134/// `workflow_dispatch` trigger. Consumed by
135/// `manual_dispatch_input_to_url_or_command` to taint-track input flow into
136/// command lines, URLs, and `actions/checkout` refs.
137pub const META_DISPATCH_INPUTS: &str = "dispatch_inputs";
138/// Graph-level metadata: pipe-delimited list of `<job>\t<name>\t<source>`
139/// records, one per `jobs.<id>.outputs.<name>`. Records are joined with `|`,
140/// fields within a record with `\t`. `source` is one of `secret` (value
141/// reads `secrets.*`), `oidc` (value references `steps.*.outputs.*` from a
142/// step that holds an OIDC identity), `step_output` (any other
143/// `steps.*.outputs.*`), or `literal`. Plain-text rather than JSON to keep
144/// the parser crate free of `serde_json`. Consumed by
145/// `sensitive_value_in_job_output`.
146pub const META_JOB_OUTPUTS: &str = "job_outputs";
147/// Step-level metadata: the value passed to `actions/checkout`'s `with.ref`
148/// input (verbatim, including any `${{ … }}` expressions). Stamped only on
149/// `actions/checkout` steps that supply a `ref:`. Consumed by
150/// `manual_dispatch_input_to_url_or_command`.
151pub const META_CHECKOUT_REF: &str = "checkout_ref";
152/// Marks the synthetic Step node created for a job that delegates to a
153/// reusable workflow with `secrets: inherit`. The whole secret bag forwards
154/// to the callee regardless of what the callee actually consumes — when the
155/// caller is fired by an attacker-controllable trigger this is a wide-open
156/// exfiltration path. Set on the synthetic step node by the GHA parser.
157pub const META_SECRETS_INHERIT: &str = "secrets_inherit";
158/// Marks a Step that downloads a workflow artifact (typically
159/// `actions/download-artifact` or `dawidd6/action-download-artifact`).
160/// In `workflow_run`-triggered consumers, the originating run's artifacts
161/// were produced from PR context — the consumer must treat their content as
162/// untrusted input even when the consumer itself runs with elevated perms.
163pub const META_DOWNLOADS_ARTIFACT: &str = "downloads_artifact";
164/// Marks a Step whose body interprets artifact (or other untrusted file)
165/// content into a privileged sink — `unzip`/`tar -x`, `cat`/`jq` piping
166/// into `>> $GITHUB_ENV`/`>> $GITHUB_OUTPUT`, `eval`, posting to a PR
167/// comment via `actions/github-script` `body:`/`issue_body:`, or evaluating
168/// extracted text. Combined with `META_DOWNLOADS_ARTIFACT` upstream in the
169/// same job and a `workflow_run`/`pull_request_target` trigger this is the
170/// classic mypy_primer / coverage-comment artifact-RCE pattern.
171pub const META_INTERPRETS_ARTIFACT: &str = "interprets_artifact";
172/// Marks a Step that uses an interactive debug action (mxschmitt/action-tmate,
173/// lhotari/action-upterm, actions/tmate, etc.). The cell value is the action
174/// reference (e.g. `mxschmitt/action-tmate@v3`). A successful debug session
175/// gives the operator an external SSH endpoint with the runner's full
176/// environment loaded — every secret in scope, the checked-out HEAD, and
177/// write access to whatever the GITHUB_TOKEN holds.
178pub const META_INTERACTIVE_DEBUG: &str = "interactive_debug";
179/// Marks a Step that calls `actions/cache` (or `actions/cache/save` /
180/// `actions/cache/restore`). The cell value is the raw `key:` input from
181/// the step's `with:` block. Consumed by `pr_specific_cache_key_in_default_branch_consumer`
182/// to detect PR-derived cache keys (head_ref, head.ref, actor) that a
183/// default-branch run can later restore — classic cache poisoning.
184pub const META_CACHE_KEY: &str = "cache_key";
185/// Records the OIDC audience (`aud:`) value of an `id_tokens:` entry on an
186/// Identity node. GitLab CI emits one Identity per `id_tokens:` key; the
187/// audience is what trades for downstream cloud creds (Vault path, AWS role,
188/// etc), so audience reuse across MR-context and protected-context jobs is
189/// the precise privilege-overscope signal. Set by the GitLab parser.
190pub const META_OIDC_AUDIENCE: &str = "oidc_audience";
191/// Records a Step's `environment:url:` value verbatim. Stamped by the GitLab
192/// parser when the job declares an `environment:` mapping with a `url:`
193/// field. Consumed by `untrusted_ci_var_in_shell_interpolation` because
194/// `environment:url:` is rendered by the GitLab UI and any predefined-CI-var
195/// interpolated into it is a stored-XSS / open-redirect sink.
196pub const META_ENVIRONMENT_URL: &str = "environment_url";
197/// Graph-level metadata: JSON-encoded array of `include:` entries declared by
198/// a GitLab CI pipeline. Each entry is an object with fields:
199/// - `kind`: one of `local`, `remote`, `template`, `project`, `component`
200/// - `target`: the path/URL/project string
201/// - `git_ref`: the resolved `ref:` value (only meaningful for `project` and
202/// `remote`) — empty string when the include omits a `ref:`
203///
204/// Set by the GitLab parser; consumed by `unpinned_include_remote_or_branch_ref`.
205pub const META_GITLAB_INCLUDES: &str = "gitlab_includes";
206/// Marks a Step (GitLab job) that declares one or more `services:` entries
207/// matching `docker:*-dind` or `docker:dind`. Combined with secret-bearing
208/// HasAccessTo edges it indicates a runtime sandbox-escape primitive — any
209/// inline build step can `docker run -v /:/host` from inside dind.
210pub const META_GITLAB_DIND_SERVICE: &str = "gitlab_dind_service";
211/// Marks a Step (GitLab job) declared with `allow_failure: true`. Used by
212/// `security_job_silently_skipped` to detect scanner jobs that pass silently.
213pub const META_GITLAB_ALLOW_FAILURE: &str = "gitlab_allow_failure";
214/// Records the comma-joined list of `extends:` template names a GitLab job
215/// inherits from. Used by scanner-name pattern matching in
216/// `security_job_silently_skipped` because GitLab security templates are
217/// usually consumed via `extends:` rather than by job-name match.
218pub const META_GITLAB_EXTENDS: &str = "gitlab_extends";
219/// Marks a Step (GitLab job) that defines a `trigger:` block (downstream /
220/// child pipeline). Value is `"static"` for a fixed downstream `project:` or
221/// `include:` of in-tree YAML, and `"dynamic"` when the include source is an
222/// `artifact:` (dynamic child pipelines — code-injection sink).
223pub const META_GITLAB_TRIGGER_KIND: &str = "gitlab_trigger_kind";
224/// Records the literal `cache.key:` value declared on a GitLab job (or the
225/// empty string if no cache is declared). Consumed by
226/// `cache_key_crosses_trust_boundary` to detect cross-trust cache keys.
227pub const META_GITLAB_CACHE_KEY: &str = "gitlab_cache_key";
228/// Records the `cache.policy:` value declared on a GitLab job
229/// (`pull` / `push` / `pull-push` / `pull_push`). When absent, the GitLab
230/// runtime default is `pull-push`. Consumed by
231/// `cache_key_crosses_trust_boundary`.
232pub const META_GITLAB_CACHE_POLICY: &str = "gitlab_cache_policy";
233/// Records the deployment environment name on a Step
234/// (e.g. GitLab `environment.name:` / GHA `environment:`).
235/// Used by rules that gate on production-like environment names.
236pub const META_ENVIRONMENT_NAME: &str = "environment_name";
237/// Records the GitLab `artifacts.reports.dotenv:` file path for a Step.
238/// When set, the file's `KEY=value` lines are silently exported as
239/// pipeline variables for every downstream job that consumes this job
240/// via `needs:` or `dependencies:`. Consumed by
241/// `dotenv_artifact_flows_to_privileged_deployment`.
242pub const META_DOTENV_FILE: &str = "dotenv_file";
243/// Records, on a Step, the upstream job names this step consumes via
244/// GitLab `needs:` or `dependencies:`. Comma-separated job names.
245/// Used to build dotenv-flow dependency chains across stages.
246pub const META_NEEDS: &str = "needs";
247/// Marks an Image node (self-hosted agent pool) as having workspace isolation
248/// configured (`workspace: { clean: all }` or `workspace: { clean: true }` in
249/// ADO). When present, the agent workspace is wiped between runs, mitigating
250/// workspace poisoning attacks where a PR build leaves malicious files for the
251/// next privileged pipeline run. Absence of this key on a self-hosted Image
252/// node is the signal for `shared_self_hosted_pool_no_isolation`.
253pub const META_WORKSPACE_CLEAN: &str = "workspace_clean";
254
255// ── Shared helpers ─────────────────────────────────────
256
257/// Serialize a `HashMap<String, V>` with keys in sorted order. The
258/// in-memory representation stays a `HashMap` (cheaper insertion, hot
259/// path on every parser); only the serialized form is canonicalised.
260/// This is the single point of determinism control for graph metadata
261/// emitted via JSON / SARIF / CloudEvents — without it, HashMap iteration
262/// order leaks per-process randomness into every diff and cache key.
263fn serialize_string_map_sorted<S, V>(
264 map: &HashMap<String, V>,
265 serializer: S,
266) -> Result<S::Ok, S::Error>
267where
268 S: Serializer,
269 V: Serialize,
270{
271 let sorted: BTreeMap<&String, &V> = map.iter().collect();
272 sorted.serialize(serializer)
273}
274
/// Reports whether `ref_str` is a SHA-pinned action reference.
///
/// The reference must contain an `@`, and the text after the *last* `@` must
/// be at least 40 characters long and consist solely of ASCII hex digits
/// (either case). Single source of truth — used by both parser and rules.
///
/// This is a *structural* check only: it does not verify that the SHA names a
/// real commit. For a check that also rejects obviously bogus values such as
/// an all-zero SHA, see [`is_pin_semantically_valid`].
pub fn is_sha_pinned(ref_str: &str) -> bool {
    // `rsplit_once` is `None` when there is no `@`, otherwise it hands back
    // whatever follows the final `@`.
    match ref_str.rsplit_once('@') {
        Some((_, suffix)) => suffix.len() >= 40 && suffix.bytes().all(|b| b.is_ascii_hexdigit()),
        None => false,
    }
}
291
/// Returns true if `image` is pinned to a Docker digest.
/// Docker digest format: `image@sha256:<64-hex-chars-lowercase>`.
///
/// Everything after the first `@sha256:` marker must be exactly 64 lowercase
/// hex characters. Truncated digests (e.g. `alpine@sha256:abc`), uppercase
/// hex, and references with anything trailing the digest (including a second
/// `@sha256:` segment) are rejected.
///
/// This is a structural check only; it does not verify that the digest exists.
pub fn is_docker_digest_pinned(image: &str) -> bool {
    // `split_once` keeps the *entire* remainder after the first marker, so
    // trailing junk cannot hide behind a well-formed first segment.
    match image.split_once("@sha256:") {
        Some((_, digest)) => {
            digest.len() == 64
                && digest
                    .bytes()
                    .all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f'))
        }
        None => false,
    }
}
309
310/// Returns true if `ref_str` looks both structurally pinned AND semantically
311/// plausible. Layered on top of [`is_sha_pinned`] / [`is_docker_digest_pinned`]:
312/// a structurally valid pin can still be obviously bogus (e.g. an all-zero SHA
313/// is syntactically a 40-char hex string but does not refer to any real
314/// commit; an attacker could use it to fake a "pinned" appearance).
315///
316/// Rules that want to flag impersonation attempts (rather than just laziness)
317/// should call this in addition to / instead of the structural check.
318///
319/// Rejects:
320/// - All-zero SHA-1 references (`actions/foo@0000…0000`).
321/// - All-zero sha256 docker digests (`image@sha256:0000…0000`).
322///
323/// Anything else that passes the structural check passes here.
324pub fn is_pin_semantically_valid(ref_str: &str) -> bool {
325 // Docker digest form takes priority (the `@sha256:` prefix is unambiguous).
326 if ref_str.contains("@sha256:") {
327 if !is_docker_digest_pinned(ref_str) {
328 return false;
329 }
330 let digest = ref_str.split("@sha256:").nth(1).unwrap_or("");
331 return !digest.chars().all(|c| c == '0');
332 }
333
334 if !is_sha_pinned(ref_str) {
335 return false;
336 }
337 let sha = ref_str.split('@').next_back().unwrap_or("");
338 !sha.chars().all(|c| c == '0')
339}
340
341// ── Graph-level precision markers ───────────────────────
342
/// The category of reason why a graph is partial.
///
/// One value is recorded per gap by [`AuthorityGraph::mark_partial`], in
/// parallel with the human-readable reason. [`AuthorityGraph::worst_gap_kind`]
/// ranks the variants `Expression` < `Structural` < `Opaque` when choosing the
/// most severe gap. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum GapKind {
    /// A template or matrix expression hides a value; graph structure is intact.
    Expression,
    /// An unresolvable component (composite action, reusable workflow, extends,
    /// include) breaks the authority chain.
    Structural,
    /// The graph cannot be built at all (zero steps produced, unknown platform).
    Opaque,
}
355
/// How complete is this authority graph? Parsers set this based on whether
/// they could fully resolve all authority relationships in the pipeline YAML.
///
/// A `Partial` graph is still useful — it just tells the consumer that some
/// authority paths may be missing. This is better than silent incompleteness.
///
/// [`AuthorityGraph::new`] starts every graph as `Complete`;
/// [`AuthorityGraph::mark_partial`] switches it to `Partial`. Serialized in
/// `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AuthorityCompleteness {
    /// Parser resolved all authority relationships.
    Complete,
    /// Parser found constructs it couldn't fully resolve (e.g. secrets in
    /// shell strings, composite actions, reusable workflows). The graph
    /// captures what it can, but edges may be missing.
    Partial,
    /// Parser couldn't determine completeness.
    Unknown,
}
373
/// How broad is an identity's scope? Classifies the risk surface of tokens,
/// service principals, and OIDC identities.
///
/// Use [`IdentityScope::from_permissions`] to derive a value from a
/// permissions string. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum IdentityScope {
    /// Wide permissions: write-all, admin, or unscoped tokens.
    Broad,
    /// Narrow permissions: contents:read, specific scopes.
    Constrained,
    /// Scope couldn't be determined — treat as risky.
    Unknown,
}
386
387impl IdentityScope {
388 /// Classify an identity scope from a permissions string.
389 pub fn from_permissions(perms: &str) -> Self {
390 let p = perms.to_lowercase();
391 if p.contains("write-all") || p.contains("admin") || p == "{}" || p.is_empty() {
392 IdentityScope::Broad
393 } else if p.contains("write") {
394 // Any write permission = broad (conservative)
395 IdentityScope::Broad
396 } else if p.contains("read") {
397 IdentityScope::Constrained
398 } else {
399 IdentityScope::Unknown
400 }
401 }
402}
403
404// ── Node types ──────────────────────────────────────────
405
/// Semantic kind of a graph node.
///
/// Matches the element types listed on [`AuthorityGraph`]: steps, secrets,
/// artifacts, identities and images. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum NodeKind {
    /// A pipeline step; the source of `HasAccessTo`, `Produces`, `UsesImage`,
    /// `DelegatesTo` and `PersistsTo` edges (see [`EdgeKind`]).
    Step,
    /// A secret that a step can be granted access to.
    Secret,
    /// An artifact produced by one step and consumed by another.
    Artifact,
    /// An identity (token, service principal, OIDC identity — see
    /// [`IdentityScope`]) that a step can be granted access to.
    Identity,
    /// An image or action that a step delegates execution to.
    Image,
}
416
/// Trust classification. Explicit on every node — not inferred from kind.
///
/// Zones are ordered from most to least trusted as
/// `FirstParty` > `ThirdParty` > `Untrusted`; compare them with
/// [`TrustZone::is_lower_than`]. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TrustZone {
    /// Code/config authored by the repo owner.
    FirstParty,
    /// Marketplace actions, external images (pinned).
    ThirdParty,
    /// Unpinned actions, fork PRs, user input.
    Untrusted,
}
428
429impl TrustZone {
430 /// Returns true if `self` is a lower trust level than `other`.
431 pub fn is_lower_than(&self, other: &TrustZone) -> bool {
432 self.rank() < other.rank()
433 }
434
435 fn rank(&self) -> u8 {
436 match self {
437 TrustZone::FirstParty => 2,
438 TrustZone::ThirdParty => 1,
439 TrustZone::Untrusted => 0,
440 }
441 }
442}
443
/// A node in the authority graph.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Node {
    /// Identifier of this node. [`AuthorityGraph::add_node`] assigns the
    /// node's index in `AuthorityGraph::nodes`.
    pub id: NodeId,
    /// What kind of pipeline element this node represents.
    pub kind: NodeKind,
    /// Name of the element as supplied by the caller that added the node.
    pub name: String,
    /// Trust classification, set explicitly when the node is added.
    pub trust_zone: TrustZone,
    /// Flexible metadata: pinning status, digest, scope, permissions, etc.
    /// Keys are the `META_*` constants defined in this module.
    /// Serialized in sorted-key order so JSON / SARIF / CloudEvents output
    /// is byte-deterministic across runs (HashMap iteration is randomised
    /// per process, which would otherwise break diffs and cache keys).
    #[serde(serialize_with = "serialize_string_map_sorted")]
    pub metadata: HashMap<String, String>,
}
458
459// ── Edge types ──────────────────────────────────────────
460
/// Edge semantics model authority/data flow — not syntactic YAML relations.
/// Design test: "Can authority propagate along this edge?"
///
/// Serialized in `snake_case` (e.g. `has_access_to`). The derived `Ord`
/// follows variant declaration order, so reordering variants changes any
/// sorted output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EdgeKind {
    /// Step -> Secret or Identity (authority granted at runtime).
    HasAccessTo,
    /// Step -> Artifact (data flows out).
    Produces,
    /// Artifact -> Step (authority flows from artifact to consuming step).
    Consumes,
    /// Step -> Image/Action (execution delegation).
    UsesImage,
    /// Step -> Step (cross-job or action boundary).
    DelegatesTo,
    /// Step -> Secret or Identity (credential written to disk, outliving the step's lifetime).
    /// Distinct from HasAccessTo: disk persistence is accessible to all subsequent steps
    /// and processes with filesystem access, not just the step that created it.
    PersistsTo,
}
481
/// Abbreviated authority context for **`HasAccessTo` → identity** edges in
/// JSON exports (ADR 0002 Phase 2). Copied from the target identity's trust
/// zone and selected `metadata` keys so consumers need not reverse-engineer
/// raw `META_*` strings for common questions. Omitted on edges where absent.
///
/// Every field is optional: a `None` field is left out of the serialized
/// object, and a field missing from the input deserializes as `None`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct AuthorityEdgeSummary {
    /// Target identity trust zone (`first_party` / `third_party` / `untrusted`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub trust_zone: Option<String>,
    /// Copy of `identity_scope` metadata when present.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub identity_scope: Option<String>,
    /// Copy of `permissions` metadata when present, truncated for bounded JSON.
    /// NOTE(review): presumably bounded by `AUTHORITY_EDGE_SUMMARY_FIELD_MAX` —
    /// the code that fills this field is not visible here; confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub permissions_summary: Option<String>,
}
498
/// Maximum characters per summary string field on [`AuthorityEdgeSummary`].
pub const AUTHORITY_EDGE_SUMMARY_FIELD_MAX: usize = 192;

/// Bounds `s` to [`AUTHORITY_EDGE_SUMMARY_FIELD_MAX`] characters.
///
/// Input that already fits is returned unchanged. Longer input is cut to
/// `max - 1` characters and an ellipsis (`…`) is appended, so the result is
/// exactly `max` characters. Lengths are counted in `char`s, not bytes, so
/// multi-byte text is never split mid-character.
fn truncate_edge_summary_field(s: &str) -> String {
    let limit = AUTHORITY_EDGE_SUMMARY_FIELD_MAX;

    // A character at index `limit` exists only when `s` exceeds the limit.
    if s.chars().nth(limit).is_none() {
        return s.to_string();
    }

    // Byte offset at which the first `limit - 1` characters end.
    let kept_chars = limit.saturating_sub(1);
    let cut = s
        .char_indices()
        .nth(kept_chars)
        .map_or(s.len(), |(offset, _)| offset);

    let mut shortened = String::with_capacity(cut + '…'.len_utf8());
    shortened.push_str(&s[..cut]);
    shortened.push('…');
    shortened
}
513
514fn trust_zone_snake_case(zone: TrustZone) -> String {
515 match zone {
516 TrustZone::FirstParty => "first_party".into(),
517 TrustZone::ThirdParty => "third_party".into(),
518 TrustZone::Untrusted => "untrusted".into(),
519 }
520}
521
/// A directed edge in the authority graph.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Edge {
    /// Identifier of this edge. [`AuthorityGraph::add_edge`] assigns the
    /// edge's index in `AuthorityGraph::edges`.
    pub id: EdgeId,
    /// Id of the node the edge starts at.
    pub from: NodeId,
    /// Id of the node the edge points to.
    pub to: NodeId,
    /// What kind of authority/data flow this edge models.
    pub kind: EdgeKind,
    /// Present on `has_access_to` edges whose target is an identity node.
    /// `add_edge` leaves this `None`; when `None` the field is omitted from
    /// serialized output, and a missing field deserializes as `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub authority_summary: Option<AuthorityEdgeSummary>,
}
533
534// ── Pipeline source ─────────────────────────────────────
535
/// Where the pipeline definition came from.
///
/// Optional fields are omitted from serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineSource {
    /// The pipeline definition file.
    /// NOTE(review): presumably a path to the pipeline YAML — confirm format
    /// (relative vs absolute) against the parsers.
    pub file: String,
    /// Repository the pipeline belongs to, when known.
    /// NOTE(review): expected format is not visible in this section — confirm.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub repo: Option<String>,
    /// Git ref the pipeline was read from, when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub git_ref: Option<String>,
    /// SHA of the commit being analyzed; reproducibility hint when set.
    /// Parsers leave None; CI integrations populate this from the build env.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub commit_sha: Option<String>,
}
549
550// ── The graph ───────────────────────────────────────────
551
/// Pipeline-level parameter declaration captured from a top-level
/// `parameters:` block. Used by rules that need to reason about whether
/// caller-supplied parameter values are constrained (`values:` allowlist)
/// or free-form (no allowlist on a string parameter — shell-injection risk).
///
/// Stored in `AuthorityGraph::parameters`, keyed by parameter name; the name
/// itself is therefore not repeated here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParamSpec {
    /// Declared parameter type (`string`, `number`, `boolean`, `object`, etc.).
    /// Empty string when the YAML omitted `type:` (ADO defaults to string).
    pub param_type: String,
    /// True when the parameter declares a `values:` allowlist that constrains
    /// the set of acceptable inputs. When true, free-form shell injection is
    /// not possible because the runtime rejects any value outside the list.
    pub has_values_allowlist: bool,
}
566
/// Directed authority graph. Nodes are pipeline elements (steps, secrets,
/// artifacts, identities, images). Edges model authority/data flow.
///
/// Build one with [`AuthorityGraph::new`] and grow it with `add_node` /
/// `add_edge`; those methods hand out ids equal to the element's index in
/// `nodes` / `edges`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuthorityGraph {
    /// Where the pipeline definition came from.
    pub source: PipelineSource,
    /// All nodes, in insertion order.
    pub nodes: Vec<Node>,
    /// All edges, in insertion order.
    pub edges: Vec<Edge>,
    /// How complete is this graph? Set by the parser based on what it could resolve.
    pub completeness: AuthorityCompleteness,
    /// Human-readable reasons why the graph is Partial (if applicable).
    /// Omitted from serialized output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub completeness_gaps: Vec<String>,
    /// Typed categories for each completeness gap (parallel to `completeness_gaps`).
    /// `mark_partial` pushes to both lists together so indices line up.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub completeness_gap_kinds: Vec<GapKind>,
    /// Graph-level metadata set by parsers (e.g. trigger type, platform-specific flags).
    /// Serialized in sorted-key order — see `Node.metadata` rationale.
    /// Omitted from serialized output when empty.
    #[serde(
        default,
        skip_serializing_if = "HashMap::is_empty",
        serialize_with = "serialize_string_map_sorted"
    )]
    pub metadata: HashMap<String, String>,
    /// Top-level pipeline `parameters:` declarations, keyed by parameter name.
    /// Populated by parsers that surface parameter metadata (currently ADO).
    /// Empty for platforms / pipelines that don't declare parameters.
    /// Serialized in sorted-key order — see `Node.metadata` rationale.
    /// Omitted from serialized output when empty.
    #[serde(
        default,
        skip_serializing_if = "HashMap::is_empty",
        serialize_with = "serialize_string_map_sorted"
    )]
    pub parameters: HashMap<String, ParamSpec>,
}
601
602impl AuthorityGraph {
603 pub fn new(source: PipelineSource) -> Self {
604 Self {
605 source,
606 nodes: Vec::new(),
607 edges: Vec::new(),
608 completeness: AuthorityCompleteness::Complete,
609 completeness_gaps: Vec::new(),
610 completeness_gap_kinds: Vec::new(),
611 metadata: HashMap::new(),
612 parameters: HashMap::new(),
613 }
614 }
615
616 /// Mark the graph as partially complete with a reason.
617 pub fn mark_partial(&mut self, kind: GapKind, reason: impl Into<String>) {
618 self.completeness = AuthorityCompleteness::Partial;
619 self.completeness_gaps.push(reason.into());
620 self.completeness_gap_kinds.push(kind);
621 }
622
623 /// Returns the most severe GapKind present, or None if the graph is complete/unknown.
624 pub fn worst_gap_kind(&self) -> Option<GapKind> {
625 self.completeness_gap_kinds
626 .iter()
627 .max_by_key(|k| match k {
628 GapKind::Expression => 0u8,
629 GapKind::Structural => 1,
630 GapKind::Opaque => 2,
631 })
632 .copied()
633 }
634
635 /// Add a node, returns its ID.
636 pub fn add_node(
637 &mut self,
638 kind: NodeKind,
639 name: impl Into<String>,
640 trust_zone: TrustZone,
641 ) -> NodeId {
642 let id = self.nodes.len();
643 self.nodes.push(Node {
644 id,
645 kind,
646 name: name.into(),
647 trust_zone,
648 metadata: HashMap::new(),
649 });
650 id
651 }
652
653 /// Add a node with metadata, returns its ID.
654 pub fn add_node_with_metadata(
655 &mut self,
656 kind: NodeKind,
657 name: impl Into<String>,
658 trust_zone: TrustZone,
659 metadata: HashMap<String, String>,
660 ) -> NodeId {
661 let id = self.nodes.len();
662 self.nodes.push(Node {
663 id,
664 kind,
665 name: name.into(),
666 trust_zone,
667 metadata,
668 });
669 id
670 }
671
672 /// Add a directed edge, returns its ID.
673 pub fn add_edge(&mut self, from: NodeId, to: NodeId, kind: EdgeKind) -> EdgeId {
674 let id = self.edges.len();
675 self.edges.push(Edge {
676 id,
677 from,
678 to,
679 kind,
680 authority_summary: None,
681 });
682 id
683 }
684
685 /// Populate [`Edge::authority_summary`] for each **`HasAccessTo`** edge whose
686 /// target is an **identity** node, from that node’s trust zone and
687 /// allowlisted metadata (`identity_scope`, `permissions`). Idempotent.
688 ///
689 /// Called automatically at the end of every built-in [`crate::ports::PipelineParser`]
690 /// implementation so `taudit graph --format json` and scan JSON include summaries.
691 pub fn stamp_edge_authority_summaries(&mut self) {
692 for edge in &mut self.edges {
693 if edge.kind != EdgeKind::HasAccessTo {
694 continue;
695 }
696 let Some(to_node) = self.nodes.get(edge.to) else {
697 continue;
698 };
699 if to_node.kind != NodeKind::Identity {
700 continue;
701 }
702 edge.authority_summary = Some(AuthorityEdgeSummary {
703 trust_zone: Some(trust_zone_snake_case(to_node.trust_zone)),
704 identity_scope: to_node
705 .metadata
706 .get(META_IDENTITY_SCOPE)
707 .map(|s| truncate_edge_summary_field(s)),
708 permissions_summary: to_node
709 .metadata
710 .get(META_PERMISSIONS)
711 .map(|s| truncate_edge_summary_field(s)),
712 });
713 }
714 }
715
716 /// Outgoing edges from a node.
717 pub fn edges_from(&self, id: NodeId) -> impl Iterator<Item = &Edge> {
718 self.edges.iter().filter(move |e| e.from == id)
719 }
720
721 /// Incoming edges to a node.
722 pub fn edges_to(&self, id: NodeId) -> impl Iterator<Item = &Edge> {
723 self.edges.iter().filter(move |e| e.to == id)
724 }
725
726 /// All authority-bearing source nodes (Secret + Identity).
727 /// These are the BFS start set for propagation analysis.
728 pub fn authority_sources(&self) -> impl Iterator<Item = &Node> {
729 self.nodes
730 .iter()
731 .filter(|n| matches!(n.kind, NodeKind::Secret | NodeKind::Identity))
732 }
733
734 /// All nodes of a given kind.
735 pub fn nodes_of_kind(&self, kind: NodeKind) -> impl Iterator<Item = &Node> {
736 self.nodes.iter().filter(move |n| n.kind == kind)
737 }
738
739 /// All nodes in a given trust zone.
740 pub fn nodes_in_zone(&self, zone: TrustZone) -> impl Iterator<Item = &Node> {
741 self.nodes.iter().filter(move |n| n.trust_zone == zone)
742 }
743
744 /// Get a node by ID.
745 pub fn node(&self, id: NodeId) -> Option<&Node> {
746 self.nodes.get(id)
747 }
748
749 /// Get an edge by ID.
750 pub fn edge(&self, id: EdgeId) -> Option<&Edge> {
751 self.edges.get(id)
752 }
753}
754
#[cfg(test)]
mod tests {
    use super::*;

    // ── Fixtures ────────────────────────────────────────────
    // Pin fixtures are either length-asserted constants or built by
    // construction, so a miscounted literal cannot silently weaken a test.

    /// Exactly 40 lowercase hex characters — a structurally valid commit SHA.
    const SHA40_LOWER: &str = "abc1234567890abcdef1234567890abcdef12345";
    /// Exactly 40 hex characters in mixed case.
    const SHA40_MIXED: &str = "ABCDEF1234567890abcdef1234567890ABCDEF12";

    /// An empty graph whose source carries only a file name.
    fn empty_graph(file: &'static str) -> AuthorityGraph {
        AuthorityGraph::new(PipelineSource {
            file: file.into(),
            repo: None,
            git_ref: None,
            commit_sha: None,
        })
    }

    /// `action@revision`, the shape of a `uses:` reference.
    fn action_ref(action: &str, revision: &str) -> String {
        format!("{action}@{revision}")
    }

    /// 64 lowercase hex characters — a structurally valid sha256 digest body.
    fn digest64() -> String {
        format!("{}abcd", "abc123def456".repeat(5))
    }

    /// `alpine@sha256:<digest>`, the shape of a digest-pinned image reference.
    fn docker_ref(digest: &str) -> String {
        format!("alpine@sha256:{digest}")
    }

    // ── Graph construction ──────────────────────────────────

    #[test]
    fn build_simple_graph() {
        let mut g = empty_graph("deploy.yml");

        let secret = g.add_node(NodeKind::Secret, "AWS_KEY", TrustZone::FirstParty);
        let step_build = g.add_node(NodeKind::Step, "build", TrustZone::FirstParty);
        let artifact = g.add_node(NodeKind::Artifact, "dist.tar.gz", TrustZone::FirstParty);
        let step_deploy = g.add_node(NodeKind::Step, "deploy", TrustZone::ThirdParty);

        g.add_edge(step_build, secret, EdgeKind::HasAccessTo);
        g.add_edge(step_build, artifact, EdgeKind::Produces);
        g.add_edge(artifact, step_deploy, EdgeKind::Consumes);

        assert_eq!(g.nodes.len(), 4);
        assert_eq!(g.edges.len(), 3);
        assert_eq!(g.authority_sources().count(), 1);
        assert_eq!(g.edges_from(step_build).count(), 2);
        assert_eq!(g.edges_from(artifact).count(), 1); // Consumes flows artifact -> step
    }

    #[test]
    fn stamp_edge_authority_summaries_on_has_access_to_identity() {
        let mut g = empty_graph("ci.yml");
        let secret = g.add_node(NodeKind::Secret, "K", TrustZone::FirstParty);
        let mut id_meta = HashMap::new();
        id_meta.insert(META_IDENTITY_SCOPE.into(), "constrained".into());
        id_meta.insert(META_PERMISSIONS.into(), "read-all".into());
        let ident = g.add_node_with_metadata(
            NodeKind::Identity,
            "GITHUB_TOKEN",
            TrustZone::FirstParty,
            id_meta,
        );
        let step = g.add_node(NodeKind::Step, "s", TrustZone::FirstParty);
        let e_secret = g.add_edge(step, secret, EdgeKind::HasAccessTo);
        let e_ident = g.add_edge(step, ident, EdgeKind::HasAccessTo);

        // Stamping is documented as idempotent: a second pass must not change
        // the outcome checked below.
        g.stamp_edge_authority_summaries();
        g.stamp_edge_authority_summaries();

        // Only identity targets get a summary; the secret edge stays bare.
        assert!(g.edges[e_secret].authority_summary.is_none());
        let sum = g.edges[e_ident]
            .authority_summary
            .as_ref()
            .expect("identity edge summary");
        assert_eq!(sum.trust_zone.as_deref(), Some("first_party"));
        assert_eq!(sum.identity_scope.as_deref(), Some("constrained"));
        assert_eq!(sum.permissions_summary.as_deref(), Some("read-all"));
    }

    // ── Completeness tracking ───────────────────────────────

    #[test]
    fn completeness_default_is_complete() {
        let g = empty_graph("test.yml");
        assert_eq!(g.completeness, AuthorityCompleteness::Complete);
        assert!(g.completeness_gaps.is_empty());
        assert!(g.completeness_gap_kinds.is_empty());
    }

    #[test]
    fn mark_partial_records_reason() {
        let reason = "secrets in run: block inferred, not precisely mapped";
        let mut g = empty_graph("test.yml");
        g.mark_partial(GapKind::Expression, reason);
        assert_eq!(g.completeness, AuthorityCompleteness::Partial);
        assert_eq!(g.completeness_gaps.len(), 1);
        assert_eq!(g.completeness_gap_kinds.len(), 1);
        assert_eq!(g.completeness_gaps[0], reason);
        assert!(matches!(g.completeness_gap_kinds[0], GapKind::Expression));
    }

    #[test]
    fn worst_gap_kind_is_none_without_gaps() {
        let g = empty_graph("test.yml");
        assert!(g.worst_gap_kind().is_none());
    }

    #[test]
    fn worst_gap_kind_reports_most_severe() {
        let mut g = empty_graph("test.yml");
        g.mark_partial(GapKind::Expression, "expression gap");
        assert!(matches!(g.worst_gap_kind(), Some(GapKind::Expression)));

        g.mark_partial(GapKind::Structural, "structural gap");
        assert!(matches!(g.worst_gap_kind(), Some(GapKind::Structural)));

        g.mark_partial(GapKind::Opaque, "opaque gap");
        // A later, less severe gap must not displace the worst one.
        g.mark_partial(GapKind::Expression, "another expression gap");
        assert!(matches!(g.worst_gap_kind(), Some(GapKind::Opaque)));
    }

    // ── Identity scope / trust zones ────────────────────────

    #[test]
    fn identity_scope_from_permissions() {
        assert_eq!(
            IdentityScope::from_permissions("write-all"),
            IdentityScope::Broad
        );
        assert_eq!(
            IdentityScope::from_permissions("{ contents: write }"),
            IdentityScope::Broad
        );
        assert_eq!(
            IdentityScope::from_permissions("{ contents: read }"),
            IdentityScope::Constrained
        );
        assert_eq!(
            IdentityScope::from_permissions("{ id-token: write }"),
            IdentityScope::Broad
        );
        assert_eq!(IdentityScope::from_permissions(""), IdentityScope::Broad);
        assert_eq!(
            IdentityScope::from_permissions("custom-scope"),
            IdentityScope::Unknown
        );
    }

    #[test]
    fn trust_zone_ordering() {
        assert!(TrustZone::Untrusted.is_lower_than(&TrustZone::FirstParty));
        assert!(TrustZone::ThirdParty.is_lower_than(&TrustZone::FirstParty));
        assert!(TrustZone::Untrusted.is_lower_than(&TrustZone::ThirdParty));
        assert!(!TrustZone::FirstParty.is_lower_than(&TrustZone::FirstParty));
    }

    // ── Pin validation (fuzz B3 regression) ─────────────────

    #[test]
    fn is_sha_pinned_accepts_lowercase_40_hex() {
        // 40 lowercase hex — the canonical legitimate form.
        assert_eq!(SHA40_LOWER.len(), 40);
        assert!(is_sha_pinned(&action_ref("actions/checkout", SHA40_LOWER)));
        // Mixed case is still structurally pinned (legitimate — Git accepts both).
        assert_eq!(SHA40_MIXED.len(), 40);
        assert!(is_sha_pinned(&action_ref("actions/checkout", SHA40_MIXED)));
    }

    #[test]
    fn is_sha_pinned_structural_accepts_all_zero() {
        // Structural check is intentionally permissive — semantic rejection
        // happens in is_pin_semantically_valid. Documented in B3.
        let all_zero = "0".repeat(40);
        assert!(is_sha_pinned(&action_ref(
            "actions/setup-python",
            &all_zero
        )));
    }

    #[test]
    fn is_sha_pinned_rejects_short_or_non_hex() {
        assert!(!is_sha_pinned("actions/checkout@v4"));
        assert!(!is_sha_pinned("actions/setup-node@a1b2c3"));
        // Long enough to be a SHA (52 chars) but not all hex.
        let long_non_hex = "somethingthatlookslikeashabutisntsha1234567890abcdef";
        assert_eq!(long_non_hex.len(), 52);
        assert!(!is_sha_pinned(&action_ref("actions/checkout", long_non_hex)));
    }

    #[test]
    fn is_pin_semantically_valid_rejects_all_zero_sha() {
        // Fuzz B3 reproducer.
        let all_zero = "0".repeat(40);
        assert!(!is_pin_semantically_valid(&action_ref(
            "actions/setup-python",
            &all_zero
        )));
    }

    #[test]
    fn is_pin_semantically_valid_accepts_real_looking_sha() {
        assert!(is_pin_semantically_valid(&action_ref(
            "actions/checkout",
            SHA40_LOWER
        )));
    }

    #[test]
    fn is_pin_semantically_valid_rejects_unpinned() {
        assert!(!is_pin_semantically_valid("actions/checkout@v4"));
        assert!(!is_pin_semantically_valid("actions/setup-node@a1b2c3"));
    }

    #[test]
    fn is_docker_digest_pinned_rejects_truncated() {
        // Fuzz B3 reproducer: previously accepted, now rejected.
        assert!(!is_docker_digest_pinned(&docker_ref("abc")));

        let full = digest64();
        assert_eq!(full.len(), 64);

        // 65 chars (one too long).
        let too_long = format!("{full}e");
        assert_eq!(too_long.len(), 65);
        assert!(!is_docker_digest_pinned(&docker_ref(&too_long)));

        // 63 chars (one short).
        let too_short = &full[..63];
        assert_eq!(too_short.len(), 63);
        assert!(!is_docker_digest_pinned(&docker_ref(too_short)));
    }

    #[test]
    fn is_docker_digest_pinned_accepts_full_64_lowercase() {
        // Exactly 64 lowercase hex chars after `@sha256:`.
        let full = digest64();
        assert_eq!(full.len(), 64);
        assert!(is_docker_digest_pinned(&docker_ref(&full)));
    }

    #[test]
    fn is_docker_digest_pinned_rejects_uppercase() {
        // Docker requires lowercase — uppercase indicates a hand-crafted /
        // tampered string and should not pass.
        let upper = digest64().to_ascii_uppercase();
        assert_eq!(upper.len(), 64);
        assert!(!is_docker_digest_pinned(&docker_ref(&upper)));
    }

    #[test]
    fn is_pin_semantically_valid_rejects_all_zero_docker_digest() {
        let all_zero = "0".repeat(64);
        assert!(!is_pin_semantically_valid(&docker_ref(&all_zero)));
    }

    #[test]
    fn is_pin_semantically_valid_accepts_real_docker_digest() {
        assert!(is_pin_semantically_valid(&docker_ref(&digest64())));
    }
}