Skip to main content

taudit_core/
graph.rs

1use serde::{Deserialize, Serialize, Serializer};
2use std::collections::{BTreeMap, HashMap};
3
4/// Unique identifier for a node in the authority graph.
5pub type NodeId = usize;
6
7/// Unique identifier for an edge in the authority graph.
8pub type EdgeId = usize;
9
10// ── Metadata key constants ─────────────────────────────
11// Avoids stringly-typed bugs across crate boundaries.
12
/// Metadata key `"digest"`.
/// NOTE(review): presumably records the pinned digest of an image / action
/// reference — confirm against the parsers that stamp it.
pub const META_DIGEST: &str = "digest";
/// Metadata key `"permissions"`.
/// NOTE(review): presumably the raw permissions string that
/// `IdentityScope::from_permissions` classifies — confirm against the parsers.
pub const META_PERMISSIONS: &str = "permissions";
/// Metadata key `"identity_scope"`.
/// NOTE(review): presumably holds the `IdentityScope` classification of an
/// Identity node — confirm the stored encoding with the parsers.
pub const META_IDENTITY_SCOPE: &str = "identity_scope";
/// Metadata key `"inferred"`.
/// NOTE(review): presumably flags graph elements that a parser inferred
/// rather than read from an explicit declaration — confirm.
pub const META_INFERRED: &str = "inferred";
17/// Marks an Image node as a job container (not a `uses:` action).
18pub const META_CONTAINER: &str = "container";
19/// Marks an Identity node as OIDC-capable (`permissions: id-token: write`).
20pub const META_OIDC: &str = "oidc";
21/// Marks a Secret node whose value is interpolated into a CLI flag argument (e.g. `-var "key=$(SECRET)"`).
22/// CLI flag values appear in pipeline log output even when ADO secret masking is active,
23/// because the command string is logged before masking runs and Terraform itself logs `-var` values.
24pub const META_CLI_FLAG_EXPOSED: &str = "cli_flag_exposed";
25/// Graph-level metadata: identifies the trigger type (e.g. `pull_request_target`, `pr`).
26pub const META_TRIGGER: &str = "trigger";
27/// Marks a Step that writes to the environment gate (`$GITHUB_ENV`, ADO `##vso[task.setvariable]`).
28pub const META_WRITES_ENV_GATE: &str = "writes_env_gate";
29/// Marks a Step that reads from the runner-managed environment via an
30/// `env.<NAME>` template reference — `${{ env.X }}` in a `with:` value,
31/// inline script body, or step `env:` mapping. Distinct from `secrets.X`
32/// references (which produce a HasAccessTo edge to a Secret node) — `env.X`
33/// references can be sourced from the ambient runner environment, including
34/// values laundered through `$GITHUB_ENV` by an earlier step. Stamped by
35/// the GHA parser so `secret_via_env_gate_to_untrusted_consumer` can find
36/// the gate-laundering chain that the explicit-secret rules miss.
37pub const META_READS_ENV: &str = "reads_env";
38/// Marks a Step that performs cryptographic provenance attestation (e.g. `actions/attest-build-provenance`).
39pub const META_ATTESTS: &str = "attests";
40/// Marks a Secret node sourced from an ADO variable group (vs inline pipeline variable).
41pub const META_VARIABLE_GROUP: &str = "variable_group";
42/// Marks an Image node as a self-hosted agent pool (pool.name on ADO; runs-on: self-hosted on GHA).
43pub const META_SELF_HOSTED: &str = "self_hosted";
44/// Marks a Step that performs a `checkout: self` (ADO) or default `actions/checkout` on a PR context.
45pub const META_CHECKOUT_SELF: &str = "checkout_self";
46/// Marks an Identity node as an ADO service connection.
47pub const META_SERVICE_CONNECTION: &str = "service_connection";
48/// Marks an Identity node as implicitly injected by the platform (e.g. ADO System.AccessToken).
49/// Implicit tokens are structurally accessible to all tasks by platform design — exposure
50/// to untrusted steps is Info-level (structural) rather than Critical (misconfiguration).
51pub const META_IMPLICIT: &str = "implicit";
52/// Marks a Step that belongs to an ADO deployment job whose `environment:` is
53/// configured with required approvals — a manual gate that breaks automatic
54/// authority propagation. Findings whose path crosses such a node have their
55/// severity reduced by one step (Critical → High → Medium → Low).
56pub const META_ENV_APPROVAL: &str = "env_approval";
57/// Records the parent job name on every Step node, enabling per-job subgraph
58/// filtering (e.g. `taudit map --job build`) and downstream consumers that
59/// need to attribute steps back to their containing job. Set by both the GHA
60/// and ADO parsers on every Step they create within a job's scope.
61pub const META_JOB_NAME: &str = "job_name";
62/// Graph-level metadata: JSON-encoded array of `resources.repositories[]`
63/// entries declared by the pipeline. Each entry is an object with fields
64/// `alias`, `repo_type`, `name`, optional `ref`, and `used` (true when the
65/// alias is referenced via `template: x@alias`, `extends: x@alias`, or
66/// `checkout: alias` somewhere in the same pipeline file). Set by the ADO
67/// parser; consumed by `template_extends_unpinned_branch`.
68pub const META_REPOSITORIES: &str = "repositories";
69/// Records the raw inline script body of a Step (the text from
70/// `script:` / `bash:` / `powershell:` / `pwsh:` / `run:` / task
71/// `inputs.script` / `inputs.Inline` / `inputs.inlineScript`). Stamped by
72/// parsers when the step has an inline script. Consumed by script-aware
73/// rules: `vm_remote_exec_via_pipeline_secret`,
74/// `short_lived_sas_in_command_line`, `secret_to_inline_script_env_export`,
75/// `secret_materialised_to_workspace_file`, `keyvault_secret_to_plaintext`,
76/// `add_spn_with_inline_script`, `parameter_interpolation_into_shell`.
77/// Stored verbatim — rules apply their own pattern matching.
78pub const META_SCRIPT_BODY: &str = "script_body";
79/// Records the name of the ADO service connection a step uses (the value of
80/// `inputs.azureSubscription` / `inputs.connectedServiceName*`). Set on the
81/// Step node itself (in addition to the Identity node it links to) so rules
82/// can pattern-match on the connection name without traversing edges.
83pub const META_SERVICE_CONNECTION_NAME: &str = "service_connection_name";
84/// Marks a Step as performing `terraform apply ... -auto-approve` (either via
85/// an inline script or via a `TerraformCLI` / `TerraformTask` task with
86/// `command: apply` and `commandOptions` containing `auto-approve`).
87pub const META_TERRAFORM_AUTO_APPROVE: &str = "terraform_auto_approve";
88/// Marks a Step task that runs with `addSpnToEnvironment: true`, exposing
89/// the federated SPN (idToken / servicePrincipalKey / servicePrincipalId /
90/// tenantId) to the inline script body via environment variables.
91pub const META_ADD_SPN_TO_ENV: &str = "add_spn_to_environment";
92/// Graph-level metadata: identifies the source platform of the parsed
93/// pipeline. Set by every parser to its `platform()` value
94/// (`"github-actions"`, `"azure-devops"`, `"gitlab"`). Allows platform-scoped
95/// rules to gate their detection without parsing the source file path.
96pub const META_PLATFORM: &str = "platform";
97/// Graph-level metadata: marks a GitHub Actions workflow as having NO
98/// top-level `permissions:` block declared. Set by the GHA parser when
99/// `workflow.permissions` is absent so rules can detect the negative-space
100/// "no permissions block at all" pattern (which leaves `GITHUB_TOKEN` at its
101/// broad platform default — `contents: write`, `packages: write`, etc.).
102pub const META_NO_WORKFLOW_PERMISSIONS: &str = "no_workflow_permissions";
103/// Marks a Step in a GHA workflow as carrying an `if:` condition that
104/// references the standard fork-check pattern
105/// (`github.event.pull_request.head.repo.fork == false` or the equivalent
106/// `head.repo.full_name == github.repository`). Stamped by the GHA parser so
107/// rules can credit the step with the compensating control without
108/// re-parsing the YAML expression. Bool stored as `"true"`.
109pub const META_FORK_CHECK: &str = "fork_check";
110/// Marks a GitLab CI job (Step node) whose `rules:` or `only:` clause
111/// restricts execution to protected branches — either via an explicit
112/// `if: $CI_COMMIT_REF_PROTECTED == "true"` rule, an `if: $CI_COMMIT_BRANCH
113/// == $CI_DEFAULT_BRANCH` rule, or an `only: [main, ...]` allowlist of
114/// platform-protected refs. Set by the GitLab parser. Absence on a
115/// deployment job is a control gap.
116pub const META_RULES_PROTECTED_ONLY: &str = "rules_protected_only";
117/// Graph-level metadata: comma-joined list of every entry under `on:` (e.g.
118/// `pull_request_target,issue_comment,workflow_run`). Distinct from
119/// `META_TRIGGER` (singular) which is set only for `pull_request_target` /
120/// ADO `pr` to preserve the existing `trigger_context_mismatch` contract.
121/// Consumers of this list (e.g. `risky_trigger_with_authority`) must split on
122/// `,` and treat each token as a trigger name.
123pub const META_TRIGGERS: &str = "triggers";
124/// Graph-level metadata: comma-joined list of `workflow_dispatch.inputs.*`
125/// names declared by the workflow. Empty / absent if the workflow has no
126/// `workflow_dispatch` trigger. Consumed by
127/// `manual_dispatch_input_to_url_or_command` to taint-track input flow into
128/// command lines, URLs, and `actions/checkout` refs.
129pub const META_DISPATCH_INPUTS: &str = "dispatch_inputs";
130/// Graph-level metadata: pipe-delimited list of `<job>\t<name>\t<source>`
131/// records, one per `jobs.<id>.outputs.<name>`. Records are joined with `|`,
132/// fields within a record with `\t`. `source` is one of `secret` (value
133/// reads `secrets.*`), `oidc` (value references `steps.*.outputs.*` from a
134/// step that holds an OIDC identity), `step_output` (any other
135/// `steps.*.outputs.*`), or `literal`. Plain-text rather than JSON to keep
136/// the parser crate free of `serde_json`. Consumed by
137/// `sensitive_value_in_job_output`.
138pub const META_JOB_OUTPUTS: &str = "job_outputs";
139/// Step-level metadata: the value passed to `actions/checkout`'s `with.ref`
140/// input (verbatim, including any `${{ … }}` expressions). Stamped only on
141/// `actions/checkout` steps that supply a `ref:`. Consumed by
142/// `manual_dispatch_input_to_url_or_command`.
143pub const META_CHECKOUT_REF: &str = "checkout_ref";
144/// Marks the synthetic Step node created for a job that delegates to a
145/// reusable workflow with `secrets: inherit`. The whole secret bag forwards
146/// to the callee regardless of what the callee actually consumes — when the
147/// caller is fired by an attacker-controllable trigger this is a wide-open
148/// exfiltration path. Set on the synthetic step node by the GHA parser.
149pub const META_SECRETS_INHERIT: &str = "secrets_inherit";
150/// Marks a Step that downloads a workflow artifact (typically
151/// `actions/download-artifact` or `dawidd6/action-download-artifact`).
152/// In `workflow_run`-triggered consumers, the originating run's artifacts
153/// were produced from PR context — the consumer must treat their content as
154/// untrusted input even when the consumer itself runs with elevated perms.
155pub const META_DOWNLOADS_ARTIFACT: &str = "downloads_artifact";
156/// Marks a Step whose body interprets artifact (or other untrusted file)
157/// content into a privileged sink — `unzip`/`tar -x`, `cat`/`jq` piping
158/// into `>> $GITHUB_ENV`/`>> $GITHUB_OUTPUT`, `eval`, posting to a PR
159/// comment via `actions/github-script` `body:`/`issue_body:`, or evaluating
160/// extracted text. Combined with `META_DOWNLOADS_ARTIFACT` upstream in the
161/// same job and a `workflow_run`/`pull_request_target` trigger this is the
162/// classic mypy_primer / coverage-comment artifact-RCE pattern.
163pub const META_INTERPRETS_ARTIFACT: &str = "interprets_artifact";
164/// Marks a Step that uses an interactive debug action (mxschmitt/action-tmate,
165/// lhotari/action-upterm, actions/tmate, etc.). The cell value is the action
166/// reference (e.g. `mxschmitt/action-tmate@v3`). A successful debug session
167/// gives the operator an external SSH endpoint with the runner's full
168/// environment loaded — every secret in scope, the checked-out HEAD, and
169/// write access to whatever the GITHUB_TOKEN holds.
170pub const META_INTERACTIVE_DEBUG: &str = "interactive_debug";
171/// Marks a Step that calls `actions/cache` (or `actions/cache/save` /
172/// `actions/cache/restore`). The cell value is the raw `key:` input from
173/// the step's `with:` block. Consumed by `pr_specific_cache_key_in_default_branch_consumer`
174/// to detect PR-derived cache keys (head_ref, head.ref, actor) that a
175/// default-branch run can later restore — classic cache poisoning.
176pub const META_CACHE_KEY: &str = "cache_key";
177/// Records the OIDC audience (`aud:`) value of an `id_tokens:` entry on an
178/// Identity node. GitLab CI emits one Identity per `id_tokens:` key; the
179/// audience is what trades for downstream cloud creds (Vault path, AWS role,
180/// etc), so audience reuse across MR-context and protected-context jobs is
181/// the precise privilege-overscope signal. Set by the GitLab parser.
182pub const META_OIDC_AUDIENCE: &str = "oidc_audience";
183/// Records a Step's `environment:url:` value verbatim. Stamped by the GitLab
184/// parser when the job declares an `environment:` mapping with a `url:`
185/// field. Consumed by `untrusted_ci_var_in_shell_interpolation` because
186/// `environment:url:` is rendered by the GitLab UI and any predefined-CI-var
187/// interpolated into it is a stored-XSS / open-redirect sink.
188pub const META_ENVIRONMENT_URL: &str = "environment_url";
189/// Graph-level metadata: JSON-encoded array of `include:` entries declared by
190/// a GitLab CI pipeline. Each entry is an object with fields:
191/// - `kind`: one of `local`, `remote`, `template`, `project`, `component`
192/// - `target`: the path/URL/project string
193/// - `git_ref`: the resolved `ref:` value (only meaningful for `project` and
194///   `remote`) — empty string when the include omits a `ref:`
195///
196/// Set by the GitLab parser; consumed by `unpinned_include_remote_or_branch_ref`.
197pub const META_GITLAB_INCLUDES: &str = "gitlab_includes";
198/// Marks a Step (GitLab job) that declares one or more `services:` entries
199/// matching `docker:*-dind` or `docker:dind`. Combined with secret-bearing
200/// HasAccessTo edges it indicates a runtime sandbox-escape primitive — any
201/// inline build step can `docker run -v /:/host` from inside dind.
202pub const META_GITLAB_DIND_SERVICE: &str = "gitlab_dind_service";
203/// Marks a Step (GitLab job) declared with `allow_failure: true`. Used by
204/// `security_job_silently_skipped` to detect scanner jobs that pass silently.
205pub const META_GITLAB_ALLOW_FAILURE: &str = "gitlab_allow_failure";
206/// Records the comma-joined list of `extends:` template names a GitLab job
207/// inherits from. Used by scanner-name pattern matching in
208/// `security_job_silently_skipped` because GitLab security templates are
209/// usually consumed via `extends:` rather than by job-name match.
210pub const META_GITLAB_EXTENDS: &str = "gitlab_extends";
211/// Marks a Step (GitLab job) that defines a `trigger:` block (downstream /
212/// child pipeline). Value is `"static"` for a fixed downstream `project:` or
213/// `include:` of in-tree YAML, and `"dynamic"` when the include source is an
214/// `artifact:` (dynamic child pipelines — code-injection sink).
215pub const META_GITLAB_TRIGGER_KIND: &str = "gitlab_trigger_kind";
216/// Records the literal `cache.key:` value declared on a GitLab job (or the
217/// empty string if no cache is declared). Consumed by
218/// `cache_key_crosses_trust_boundary` to detect cross-trust cache keys.
219pub const META_GITLAB_CACHE_KEY: &str = "gitlab_cache_key";
220/// Records the `cache.policy:` value declared on a GitLab job
221/// (`pull` / `push` / `pull-push` / `pull_push`). When absent, the GitLab
222/// runtime default is `pull-push`. Consumed by
223/// `cache_key_crosses_trust_boundary`.
224pub const META_GITLAB_CACHE_POLICY: &str = "gitlab_cache_policy";
225/// Records the deployment environment name on a Step
226/// (e.g. GitLab `environment.name:` / GHA `environment:`).
227/// Used by rules that gate on production-like environment names.
228pub const META_ENVIRONMENT_NAME: &str = "environment_name";
229/// Records the GitLab `artifacts.reports.dotenv:` file path for a Step.
230/// When set, the file's `KEY=value` lines are silently exported as
231/// pipeline variables for every downstream job that consumes this job
232/// via `needs:` or `dependencies:`. Consumed by
233/// `dotenv_artifact_flows_to_privileged_deployment`.
234pub const META_DOTENV_FILE: &str = "dotenv_file";
235/// Records, on a Step, the upstream job names this step consumes via
236/// GitLab `needs:` or `dependencies:`. Comma-separated job names.
237/// Used to build dotenv-flow dependency chains across stages.
238pub const META_NEEDS: &str = "needs";
239/// Marks an Image node (self-hosted agent pool) as having workspace isolation
240/// configured (`workspace: { clean: all }` or `workspace: { clean: true }` in
241/// ADO). When present, the agent workspace is wiped between runs, mitigating
242/// workspace poisoning attacks where a PR build leaves malicious files for the
243/// next privileged pipeline run. Absence of this key on a self-hosted Image
244/// node is the signal for `shared_self_hosted_pool_no_isolation`.
245pub const META_WORKSPACE_CLEAN: &str = "workspace_clean";
246
247// ── Shared helpers ─────────────────────────────────────
248
249/// Serialize a `HashMap<String, V>` with keys in sorted order. The
250/// in-memory representation stays a `HashMap` (cheaper insertion, hot
251/// path on every parser); only the serialized form is canonicalised.
252/// This is the single point of determinism control for graph metadata
253/// emitted via JSON / SARIF / CloudEvents — without it, HashMap iteration
254/// order leaks per-process randomness into every diff and cache key.
255fn serialize_string_map_sorted<S, V>(
256    map: &HashMap<String, V>,
257    serializer: S,
258) -> Result<S::Ok, S::Error>
259where
260    S: Serializer,
261    V: Serialize,
262{
263    let sorted: BTreeMap<&String, &V> = map.iter().collect();
264    sorted.serialize(serializer)
265}
266
/// Reports whether `ref_str` is a SHA-pinned action reference.
///
/// The reference must contain an `@`, and the text after the *last* `@`
/// must be at least 40 characters long and consist solely of ASCII hex
/// digits (either case). Single source of truth — used by both parser and
/// rules.
///
/// This is a purely *structural* check: any 40+ hex character suffix is
/// accepted without verifying that it names a real commit. For a semantic
/// check that rejects obviously-bogus values like all-zero, see
/// [`is_pin_semantically_valid`].
pub fn is_sha_pinned(ref_str: &str) -> bool {
    match ref_str.rsplit_once('@') {
        Some((_, suffix)) => {
            suffix.len() >= 40 && suffix.bytes().all(|b| b.is_ascii_hexdigit())
        }
        // No `@` at all: nothing to treat as a pin.
        None => false,
    }
}
283
/// Returns true if `image` is pinned to a Docker digest.
/// Docker digest format: `image@sha256:<64-hex-chars-lowercase>`.
///
/// Everything after the first `@sha256:` marker must be exactly 64
/// lowercase hex characters. Truncated digests (e.g. `alpine@sha256:abc`),
/// uppercase hex, and references with trailing content after the digest
/// (e.g. a second `@sha256:` segment) are rejected — Docker requires the
/// full 64-character lowercase hex form and nothing else.
pub fn is_docker_digest_pinned(image: &str) -> bool {
    match image.split_once("@sha256:") {
        // Validate the whole remainder so nothing can be smuggled in after
        // a well-formed digest.
        Some((_, digest)) => {
            digest.len() == 64
                && digest
                    .bytes()
                    .all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f'))
        }
        None => false,
    }
}
301
302/// Returns true if `ref_str` looks both structurally pinned AND semantically
303/// plausible. Layered on top of [`is_sha_pinned`] / [`is_docker_digest_pinned`]:
304/// a structurally valid pin can still be obviously bogus (e.g. an all-zero SHA
305/// is syntactically a 40-char hex string but does not refer to any real
306/// commit; an attacker could use it to fake a "pinned" appearance).
307///
308/// Rules that want to flag impersonation attempts (rather than just laziness)
309/// should call this in addition to / instead of the structural check.
310///
311/// Rejects:
312/// - All-zero SHA-1 references (`actions/foo@0000…0000`).
313/// - All-zero sha256 docker digests (`image@sha256:0000…0000`).
314///
315/// Anything else that passes the structural check passes here.
316pub fn is_pin_semantically_valid(ref_str: &str) -> bool {
317    // Docker digest form takes priority (the `@sha256:` prefix is unambiguous).
318    if ref_str.contains("@sha256:") {
319        if !is_docker_digest_pinned(ref_str) {
320            return false;
321        }
322        let digest = ref_str.split("@sha256:").nth(1).unwrap_or("");
323        return !digest.chars().all(|c| c == '0');
324    }
325
326    if !is_sha_pinned(ref_str) {
327        return false;
328    }
329    let sha = ref_str.split('@').next_back().unwrap_or("");
330    !sha.chars().all(|c| c == '0')
331}
332
333// ── Graph-level precision markers ───────────────────────
334
/// How complete is this authority graph? Parsers set this based on whether
/// they could fully resolve all authority relationships in the pipeline YAML.
///
/// A `Partial` graph is still useful — it just tells the consumer that some
/// authority paths may be missing. This is better than silent incompleteness.
///
/// Variants serialize in `snake_case` (`complete`, `partial`, `unknown`).
/// `AuthorityGraph::new` starts a graph at `Complete`;
/// `AuthorityGraph::mark_partial` switches it to `Partial`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AuthorityCompleteness {
    /// Parser resolved all authority relationships.
    Complete,
    /// Parser found constructs it couldn't fully resolve (e.g. secrets in
    /// shell strings, composite actions, reusable workflows). The graph
    /// captures what it can, but edges may be missing.
    Partial,
    /// Parser couldn't determine completeness.
    Unknown,
}
352
/// How broad is an identity's scope? Classifies the risk surface of tokens,
/// service principals, and OIDC identities.
///
/// Variants serialize in `snake_case` (`broad`, `constrained`, `unknown`).
/// See `IdentityScope::from_permissions` for the string-based classifier.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum IdentityScope {
    /// Wide permissions: write-all, admin, or unscoped tokens.
    Broad,
    /// Narrow permissions: contents:read, specific scopes.
    Constrained,
    /// Scope couldn't be determined — treat as risky.
    Unknown,
}
365
366impl IdentityScope {
367    /// Classify an identity scope from a permissions string.
368    pub fn from_permissions(perms: &str) -> Self {
369        let p = perms.to_lowercase();
370        if p.contains("write-all") || p.contains("admin") || p == "{}" || p.is_empty() {
371            IdentityScope::Broad
372        } else if p.contains("write") {
373            // Any write permission = broad (conservative)
374            IdentityScope::Broad
375        } else if p.contains("read") {
376            IdentityScope::Constrained
377        } else {
378            IdentityScope::Unknown
379        }
380    }
381}
382
383// ── Node types ──────────────────────────────────────────
384
/// Semantic kind of a graph node.
///
/// Variants serialize in `snake_case`. `Secret` and `Identity` are the
/// authority-bearing kinds returned by `AuthorityGraph::authority_sources`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum NodeKind {
    /// A pipeline step (source of `HasAccessTo` / `Produces` / `UsesImage` edges).
    Step,
    /// A secret value a step can be granted access to.
    Secret,
    /// A build artifact produced by one step and consumed by another.
    Artifact,
    /// An identity (token, service connection, OIDC identity).
    Identity,
    /// An image or action a step delegates execution to.
    Image,
}
395
/// Trust classification. Explicit on every node — not inferred from kind.
///
/// Ordered from least to most trusted as `Untrusted` < `ThirdParty` <
/// `FirstParty`; compare with `TrustZone::is_lower_than`. Variants serialize
/// in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TrustZone {
    /// Code/config authored by the repo owner.
    FirstParty,
    /// Marketplace actions, external images (pinned).
    ThirdParty,
    /// Unpinned actions, fork PRs, user input.
    Untrusted,
}
407
408impl TrustZone {
409    /// Returns true if `self` is a lower trust level than `other`.
410    pub fn is_lower_than(&self, other: &TrustZone) -> bool {
411        self.rank() < other.rank()
412    }
413
414    fn rank(&self) -> u8 {
415        match self {
416            TrustZone::FirstParty => 2,
417            TrustZone::ThirdParty => 1,
418            TrustZone::Untrusted => 0,
419        }
420    }
421}
422
/// A node in the authority graph.
///
/// Created through `AuthorityGraph::add_node` /
/// `AuthorityGraph::add_node_with_metadata`, which assign `id`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Node {
    /// Identifier assigned at insertion: the node's index in
    /// `AuthorityGraph::nodes` at the time it was added.
    pub id: NodeId,
    /// Semantic kind of pipeline element this node represents.
    pub kind: NodeKind,
    /// Display name supplied by the caller that created the node.
    pub name: String,
    /// Trust classification, set explicitly by the creator.
    pub trust_zone: TrustZone,
    /// Flexible metadata: pinning status, digest, scope, permissions, etc.
    /// Serialized in sorted-key order so JSON / SARIF / CloudEvents output
    /// is byte-deterministic across runs (HashMap iteration is randomised
    /// per process, which would otherwise break diffs and cache keys).
    #[serde(serialize_with = "serialize_string_map_sorted")]
    pub metadata: HashMap<String, String>,
}
437
438// ── Edge types ──────────────────────────────────────────
439
/// Edge semantics model authority/data flow — not syntactic YAML relations.
/// Design test: "Can authority propagate along this edge?"
///
/// Each variant documents the node kinds it is meant to connect
/// (`from -> to`). Variants serialize in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EdgeKind {
    /// Step -> Secret or Identity (authority granted at runtime).
    HasAccessTo,
    /// Step -> Artifact (data flows out).
    Produces,
    /// Artifact -> Step (authority flows from artifact to consuming step).
    Consumes,
    /// Step -> Image/Action (execution delegation).
    UsesImage,
    /// Step -> Step (cross-job or action boundary).
    DelegatesTo,
    /// Step -> Secret or Identity (credential written to disk, outliving the step's lifetime).
    /// Distinct from HasAccessTo: disk persistence is accessible to all subsequent steps
    /// and processes with filesystem access, not just the step that created it.
    PersistsTo,
}
460
/// A directed edge in the authority graph.
///
/// Created through `AuthorityGraph::add_edge`, which assigns `id`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Edge {
    /// Identifier assigned at insertion: the edge's index in
    /// `AuthorityGraph::edges` at the time it was added.
    pub id: EdgeId,
    /// Node the edge starts at.
    pub from: NodeId,
    /// Node the edge points to.
    pub to: NodeId,
    /// What kind of authority/data flow the edge models.
    pub kind: EdgeKind,
}
469
470// ── Pipeline source ─────────────────────────────────────
471
/// Where the pipeline definition came from.
///
/// The optional fields are left out of the serialized form when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineSource {
    /// Pipeline definition file (e.g. `deploy.yml`).
    /// NOTE(review): presumably a path as given to the parser — confirm
    /// whether it is relative or absolute.
    pub file: String,
    /// Repository the file belongs to, when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub repo: Option<String>,
    /// Git ref the file was read at, when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub git_ref: Option<String>,
    /// SHA of the commit being analyzed; reproducibility hint when set.
    /// Parsers leave None; CI integrations populate this from the build env.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub commit_sha: Option<String>,
}
485
486// ── The graph ───────────────────────────────────────────
487
/// Pipeline-level parameter declaration captured from a top-level
/// `parameters:` block. Used by rules that need to reason about whether
/// caller-supplied parameter values are constrained (`values:` allowlist)
/// or free-form (no allowlist on a string parameter — shell-injection risk).
///
/// Stored in `AuthorityGraph::parameters`, keyed by parameter name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParamSpec {
    /// Declared parameter type (`string`, `number`, `boolean`, `object`, etc.).
    /// Empty string when the YAML omitted `type:` (ADO defaults to string).
    pub param_type: String,
    /// True when the parameter declares a `values:` allowlist that constrains
    /// the set of acceptable inputs. When true, free-form shell injection is
    /// not possible because the runtime rejects any value outside the list.
    pub has_values_allowlist: bool,
}
502
/// Directed authority graph. Nodes are pipeline elements (steps, secrets,
/// artifacts, identities, images). Edges model authority/data flow.
///
/// Nodes and edges live in plain vectors; the `add_*` methods assign each
/// new element an id equal to its index at insertion time. When serialized,
/// `completeness_gaps`, `metadata`, and `parameters` are omitted if empty,
/// and all three default to empty when absent on deserialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuthorityGraph {
    /// Where the pipeline definition came from.
    pub source: PipelineSource,
    /// All nodes, in insertion order.
    pub nodes: Vec<Node>,
    /// All edges, in insertion order.
    pub edges: Vec<Edge>,
    /// How complete is this graph? Set by the parser based on what it could resolve.
    pub completeness: AuthorityCompleteness,
    /// Human-readable reasons why the graph is Partial (if applicable).
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub completeness_gaps: Vec<String>,
    /// Graph-level metadata set by parsers (e.g. trigger type, platform-specific flags).
    /// Serialized in sorted-key order — see `Node.metadata` rationale.
    #[serde(
        default,
        skip_serializing_if = "HashMap::is_empty",
        serialize_with = "serialize_string_map_sorted"
    )]
    pub metadata: HashMap<String, String>,
    /// Top-level pipeline `parameters:` declarations, keyed by parameter name.
    /// Populated by parsers that surface parameter metadata (currently ADO).
    /// Empty for platforms / pipelines that don't declare parameters.
    /// Serialized in sorted-key order — see `Node.metadata` rationale.
    #[serde(
        default,
        skip_serializing_if = "HashMap::is_empty",
        serialize_with = "serialize_string_map_sorted"
    )]
    pub parameters: HashMap<String, ParamSpec>,
}
534
535impl AuthorityGraph {
536    pub fn new(source: PipelineSource) -> Self {
537        Self {
538            source,
539            nodes: Vec::new(),
540            edges: Vec::new(),
541            completeness: AuthorityCompleteness::Complete,
542            completeness_gaps: Vec::new(),
543            metadata: HashMap::new(),
544            parameters: HashMap::new(),
545        }
546    }
547
548    /// Mark the graph as partially complete with a reason.
549    pub fn mark_partial(&mut self, reason: impl Into<String>) {
550        self.completeness = AuthorityCompleteness::Partial;
551        self.completeness_gaps.push(reason.into());
552    }
553
554    /// Add a node, returns its ID.
555    pub fn add_node(
556        &mut self,
557        kind: NodeKind,
558        name: impl Into<String>,
559        trust_zone: TrustZone,
560    ) -> NodeId {
561        let id = self.nodes.len();
562        self.nodes.push(Node {
563            id,
564            kind,
565            name: name.into(),
566            trust_zone,
567            metadata: HashMap::new(),
568        });
569        id
570    }
571
572    /// Add a node with metadata, returns its ID.
573    pub fn add_node_with_metadata(
574        &mut self,
575        kind: NodeKind,
576        name: impl Into<String>,
577        trust_zone: TrustZone,
578        metadata: HashMap<String, String>,
579    ) -> NodeId {
580        let id = self.nodes.len();
581        self.nodes.push(Node {
582            id,
583            kind,
584            name: name.into(),
585            trust_zone,
586            metadata,
587        });
588        id
589    }
590
591    /// Add a directed edge, returns its ID.
592    pub fn add_edge(&mut self, from: NodeId, to: NodeId, kind: EdgeKind) -> EdgeId {
593        let id = self.edges.len();
594        self.edges.push(Edge { id, from, to, kind });
595        id
596    }
597
598    /// Outgoing edges from a node.
599    pub fn edges_from(&self, id: NodeId) -> impl Iterator<Item = &Edge> {
600        self.edges.iter().filter(move |e| e.from == id)
601    }
602
603    /// Incoming edges to a node.
604    pub fn edges_to(&self, id: NodeId) -> impl Iterator<Item = &Edge> {
605        self.edges.iter().filter(move |e| e.to == id)
606    }
607
608    /// All authority-bearing source nodes (Secret + Identity).
609    /// These are the BFS start set for propagation analysis.
610    pub fn authority_sources(&self) -> impl Iterator<Item = &Node> {
611        self.nodes
612            .iter()
613            .filter(|n| matches!(n.kind, NodeKind::Secret | NodeKind::Identity))
614    }
615
616    /// All nodes of a given kind.
617    pub fn nodes_of_kind(&self, kind: NodeKind) -> impl Iterator<Item = &Node> {
618        self.nodes.iter().filter(move |n| n.kind == kind)
619    }
620
621    /// All nodes in a given trust zone.
622    pub fn nodes_in_zone(&self, zone: TrustZone) -> impl Iterator<Item = &Node> {
623        self.nodes.iter().filter(move |n| n.trust_zone == zone)
624    }
625
626    /// Get a node by ID.
627    pub fn node(&self, id: NodeId) -> Option<&Node> {
628        self.nodes.get(id)
629    }
630
631    /// Get an edge by ID.
632    pub fn edge(&self, id: EdgeId) -> Option<&Edge> {
633        self.edges.get(id)
634    }
635}
636
#[cfg(test)]
mod tests {
    use super::*;

    // ── Shared fixtures ─────────────────────────────────────

    /// Action reference whose ref part is entirely lowercase hex.
    // NOTE(review): the ref in this literal is 41 hex characters long, not 40;
    // confirm `is_sha_pinned` is meant to accept refs longer than 40 chars.
    const LOWERCASE_HEX_PIN: &str =
        "actions/checkout@abc1234567890abcdef1234567890abcdef123456";

    /// Action reference pinned to an all-zero SHA (fuzz B3 reproducer).
    const ALL_ZERO_SHA_PIN: &str =
        "actions/setup-python@0000000000000000000000000000000000000000";

    /// Image reference with exactly 64 lowercase hex chars after `@sha256:`.
    const FULL_DIGEST_PIN: &str =
        "alpine@sha256:abc123def456abc123def456abc123def456abc123def456abc123def456abcd";

    /// Creates an empty graph for `file`, leaving `repo`, `git_ref` and
    /// `commit_sha` unset.
    fn empty_graph(file: &'static str) -> AuthorityGraph {
        AuthorityGraph::new(PipelineSource {
            file: file.into(),
            repo: None,
            git_ref: None,
            commit_sha: None,
        })
    }

    // ── Graph construction ──────────────────────────────────

    #[test]
    fn build_simple_graph() {
        let mut graph = empty_graph("deploy.yml");

        // One secret, two steps, and the artifact handed from build to deploy.
        let aws_key = graph.add_node(NodeKind::Secret, "AWS_KEY", TrustZone::FirstParty);
        let build = graph.add_node(NodeKind::Step, "build", TrustZone::FirstParty);
        let tarball = graph.add_node(NodeKind::Artifact, "dist.tar.gz", TrustZone::FirstParty);
        let deploy = graph.add_node(NodeKind::Step, "deploy", TrustZone::ThirdParty);

        graph.add_edge(build, aws_key, EdgeKind::HasAccessTo);
        graph.add_edge(build, tarball, EdgeKind::Produces);
        graph.add_edge(tarball, deploy, EdgeKind::Consumes);

        assert_eq!(graph.nodes.len(), 4);
        assert_eq!(graph.edges.len(), 3);
        // Only the secret counts as an authority source here.
        assert_eq!(graph.authority_sources().count(), 1);
        assert_eq!(graph.edges_from(build).count(), 2);
        // The Consumes edge is stored as artifact -> step.
        assert_eq!(graph.edges_from(tarball).count(), 1);
    }

    #[test]
    fn completeness_default_is_complete() {
        let graph = empty_graph("test.yml");

        assert_eq!(graph.completeness, AuthorityCompleteness::Complete);
        assert!(graph.completeness_gaps.is_empty());
    }

    #[test]
    fn mark_partial_records_reason() {
        let mut graph = empty_graph("test.yml");

        graph.mark_partial("secrets in run: block inferred, not precisely mapped");

        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
        assert_eq!(graph.completeness_gaps.len(), 1);
    }

    // ── Scope and trust-zone classification ─────────────────

    #[test]
    fn identity_scope_from_permissions() {
        // (permissions string, scope it must classify as)
        let cases = [
            ("write-all", IdentityScope::Broad),
            ("{ contents: write }", IdentityScope::Broad),
            ("{ contents: read }", IdentityScope::Constrained),
            ("{ id-token: write }", IdentityScope::Broad),
            ("", IdentityScope::Broad),
            ("custom-scope", IdentityScope::Unknown),
        ];

        for (permissions, expected) in &cases {
            assert_eq!(
                IdentityScope::from_permissions(*permissions),
                *expected,
                "permissions = {:?}",
                permissions
            );
        }
    }

    #[test]
    fn trust_zone_ordering() {
        // Untrusted sits below both other zones.
        assert!(TrustZone::Untrusted.is_lower_than(&TrustZone::FirstParty));
        assert!(TrustZone::Untrusted.is_lower_than(&TrustZone::ThirdParty));
        // ThirdParty sits below FirstParty.
        assert!(TrustZone::ThirdParty.is_lower_than(&TrustZone::FirstParty));
        // A zone is never lower than itself.
        assert!(!TrustZone::FirstParty.is_lower_than(&TrustZone::FirstParty));
    }

    // ── Pin validation (fuzz B3 regression) ─────────────────

    #[test]
    fn is_sha_pinned_accepts_lowercase_40_hex() {
        // All-lowercase hex ref: the canonical legitimate form.
        assert!(is_sha_pinned(LOWERCASE_HEX_PIN));
        // Mixed case is still structurally pinned (legitimate: Git accepts both).
        assert!(is_sha_pinned(
            "actions/checkout@ABCDEF1234567890abcdef1234567890ABCDEF12"
        ));
    }

    #[test]
    fn is_sha_pinned_structural_accepts_all_zero() {
        // The structural check is intentionally permissive; the all-zero SHA is
        // rejected semantically by is_pin_semantically_valid. Documented in B3.
        assert!(is_sha_pinned(ALL_ZERO_SHA_PIN));
    }

    #[test]
    fn is_sha_pinned_rejects_short_or_non_hex() {
        // Tag and short-SHA refs are too short to count as pinned.
        assert!(!is_sha_pinned("actions/checkout@v4"));
        assert!(!is_sha_pinned("actions/setup-node@a1b2c3"));
        // Longer than 40 chars, but not all of them are hex digits.
        assert!(!is_sha_pinned(
            "actions/checkout@somethingthatlookslikeashabutisntsha1234567890abcdef"
        ));
    }

    #[test]
    fn is_pin_semantically_valid_rejects_all_zero_sha() {
        // Fuzz B3 reproducer.
        assert!(!is_pin_semantically_valid(ALL_ZERO_SHA_PIN));
    }

    #[test]
    fn is_pin_semantically_valid_accepts_real_looking_sha() {
        assert!(is_pin_semantically_valid(LOWERCASE_HEX_PIN));
    }

    #[test]
    fn is_pin_semantically_valid_rejects_unpinned() {
        assert!(!is_pin_semantically_valid("actions/checkout@v4"));
        assert!(!is_pin_semantically_valid("actions/setup-node@a1b2c3"));
    }

    #[test]
    fn is_docker_digest_pinned_rejects_truncated() {
        // Fuzz B3 reproducer: previously accepted, now rejected.
        assert!(!is_docker_digest_pinned("alpine@sha256:abc"));
        // 65 hex chars: one more than a full digest.
        assert!(!is_docker_digest_pinned(
            "alpine@sha256:abc123def456abc123def456abc123def456abc123def456abc123def456abcde"
        ));
        // 63 hex chars: one fewer than a full digest.
        assert!(!is_docker_digest_pinned(
            "alpine@sha256:abc123def456abc123def456abc123def456abc123def456abc123def456abc"
        ));
    }

    #[test]
    fn is_docker_digest_pinned_accepts_full_64_lowercase() {
        assert!(is_docker_digest_pinned(FULL_DIGEST_PIN));
    }

    #[test]
    fn is_docker_digest_pinned_rejects_uppercase() {
        // Docker requires lowercase; an uppercase digest indicates a
        // hand-crafted / tampered string and should not pass.
        assert!(!is_docker_digest_pinned(
            "alpine@sha256:ABC123DEF456ABC123DEF456ABC123DEF456ABC123DEF456ABC123DEF456ABCD"
        ));
    }

    #[test]
    fn is_pin_semantically_valid_rejects_all_zero_docker_digest() {
        assert!(!is_pin_semantically_valid(
            "alpine@sha256:0000000000000000000000000000000000000000000000000000000000000000"
        ));
    }

    #[test]
    fn is_pin_semantically_valid_accepts_real_docker_digest() {
        assert!(is_pin_semantically_valid(FULL_DIGEST_PIN));
    }
}