Skip to main content

taudit_parse_gitlab/
lib.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_yaml::Value;
5use taudit_core::error::TauditError;
6use taudit_core::graph::*;
7// Re-import explicitly to make the new constants visible at a glance.
8#[allow(unused_imports)]
9use taudit_core::graph::{META_DOTENV_FILE, META_ENVIRONMENT_NAME, META_NEEDS, META_SCRIPT_BODY};
10use taudit_core::ports::PipelineParser;
11
/// GitLab CI YAML parser.
///
/// Parses `.gitlab-ci.yml` files into an `AuthorityGraph`. The authority model:
/// - Each non-hidden job is a `Step` node.
/// - `CI_JOB_TOKEN` is a global implicit `Identity` (always present, scope=broad).
/// - `secrets:` entries emit `Secret` nodes with `HasAccessTo` edges.
/// - `id_tokens:` entries emit OIDC `Identity` nodes.
/// - `variables:` entries with credential-pattern names emit `Secret` nodes.
/// - A job's `image:` (falling back to the top-level `image:`) and its
///   `services:` emit `Image` nodes with `UsesImage` edges.
/// - `include:`, `extends:` and hidden (dot-prefixed) template jobs mark the
///   graph `Partial`; hidden jobs themselves are skipped.
/// - `rules: if: merge_request_event` and `only: merge_requests` set `META_TRIGGER`.
pub struct GitlabParser;
24
/// Reserved top-level keys that are not job definitions.
///
/// The job loop in `GitlabParser::parse` skips every top-level key listed
/// here; any other top-level key is considered a job candidate.
const RESERVED: &[&str] = &[
    "stages",
    "workflow",
    "include",
    "variables",
    "image",
    "services",
    "default",
    "cache",
    "before_script",
    "after_script",
    "types",
];
39
/// Variable name fragments that indicate a credential rather than plain config.
///
/// `is_credential_name` upper-cases the variable name and tests each fragment
/// as a substring, so every entry here must be written in upper case.
const CRED_FRAGMENTS: &[&str] = &[
    "TOKEN",
    "SECRET",
    "PASSWORD",
    "PASSWD",
    "PRIVATE_KEY",
    "API_KEY",
    "APIKEY",
    "SIGNING_KEY",
    "ACCESS_KEY",
    "SERVICE_ACCOUNT",
    "CERT",
    "CREDENTIAL",
];
55
56impl PipelineParser for GitlabParser {
57    fn platform(&self) -> &str {
58        "gitlab-ci"
59    }
60
61    fn parse(&self, content: &str, source: &PipelineSource) -> Result<AuthorityGraph, TauditError> {
62        let mut de = serde_yaml::Deserializer::from_str(content);
63        let doc = de
64            .next()
65            .ok_or_else(|| TauditError::Parse("empty YAML document".into()))?;
66        let root: Value = Value::deserialize(doc)
67            .map_err(|e| TauditError::Parse(format!("YAML parse error: {e}")))?;
68
69        let mapping = root
70            .as_mapping()
71            .ok_or_else(|| TauditError::Parse("GitLab CI root must be a mapping".into()))?;
72
73        let mut graph = AuthorityGraph::new(source.clone());
74        graph.metadata.insert(META_PLATFORM.into(), "gitlab".into());
75
76        // CI_JOB_TOKEN is always present in every GitLab CI job — it's the built-in
77        // platform token, equivalent to ADO's System.AccessToken or GHA's GITHUB_TOKEN.
78        let mut meta = HashMap::new();
79        meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
80        meta.insert(META_IMPLICIT.into(), "true".into());
81        let token_id = graph.add_node_with_metadata(
82            NodeKind::Identity,
83            "CI_JOB_TOKEN",
84            TrustZone::FirstParty,
85            meta,
86        );
87
88        // Top-level include: → mark Partial immediately AND capture each
89        // entry's structure as graph metadata so include-pinning rules can
90        // reason about remote URLs and unpinned project refs.
91        if let Some(inc) = mapping.get("include") {
92            graph.mark_partial(
93                "include: directive present — included templates not resolved".to_string(),
94            );
95            let entries = extract_include_entries(inc);
96            if !entries.is_empty() {
97                if let Ok(json) = serde_json::to_string(&entries) {
98                    graph.metadata.insert(META_GITLAB_INCLUDES.into(), json);
99                }
100            }
101        }
102
103        // Global variables
104        let global_secrets = process_variables(mapping.get("variables"), &mut graph, "pipeline");
105
106        // Global image
107        let global_image = mapping.get("image").and_then(extract_image_str);
108
109        // Top-level merge_request trigger detection from `workflow:` rules
110        if let Some(wf) = mapping.get("workflow") {
111            if has_mr_trigger_in_workflow(wf) {
112                graph
113                    .metadata
114                    .insert(META_TRIGGER.into(), "merge_request".into());
115            }
116        }
117
118        // Process each job (any top-level key not in RESERVED)
119        for (key, value) in mapping {
120            let job_name = match key.as_str() {
121                Some(k) => k,
122                None => continue,
123            };
124            if RESERVED.contains(&job_name) {
125                continue;
126            }
127
128            // Hidden jobs (starting with a dot) are templates — mark Partial, skip
129            if job_name.starts_with('.') {
130                graph.mark_partial(format!(
131                    "job '{job_name}' is a hidden/template job — not resolved"
132                ));
133                continue;
134            }
135
136            let job_map = match value.as_mapping() {
137                Some(m) => m,
138                None => continue,
139            };
140
141            // extends: — job template inheritance, can't resolve statically
142            let extends_names = extract_extends_list(job_map.get("extends"));
143            if !extends_names.is_empty() {
144                graph.mark_partial(format!(
145                    "job '{job_name}' uses extends: — inherited configuration not resolved"
146                ));
147            }
148
149            // Detect PR/MR trigger in this job's rules: or only:
150            let job_triggers_mr = job_has_mr_trigger(job_map);
151
152            // Propagate job MR trigger to graph level
153            if job_triggers_mr && !graph.metadata.contains_key(META_TRIGGER) {
154                graph
155                    .metadata
156                    .insert(META_TRIGGER.into(), "merge_request".into());
157            }
158
159            // Job-level variables
160            let job_secrets = process_variables(job_map.get("variables"), &mut graph, job_name);
161
162            // Job-level explicit secrets: (Vault, AWS Secrets Manager, GCP, Azure)
163            let explicit_secrets =
164                process_explicit_secrets(job_map.get("secrets"), job_name, &mut graph);
165
166            // Job-level OIDC tokens (id_tokens:)
167            let oidc_identities = process_id_tokens(job_map.get("id_tokens"), job_name, &mut graph);
168
169            // Job image (falls back to global)
170            let job_image_str = job_map
171                .get("image")
172                .and_then(extract_image_str)
173                .or(global_image.as_deref().map(String::from));
174
175            let image_id = job_image_str.as_deref().map(|img| {
176                let pinned = is_docker_digest_pinned(img);
177                let trust_zone = if pinned {
178                    TrustZone::ThirdParty
179                } else {
180                    TrustZone::Untrusted
181                };
182                let mut imeta = HashMap::new();
183                if let Some(digest) = img.split("@sha256:").nth(1) {
184                    imeta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
185                }
186                graph.add_node_with_metadata(NodeKind::Image, img, trust_zone, imeta)
187            });
188
189            // Services (each is an Image node)
190            let service_ids = process_services(job_map.get("services"), &mut graph);
191
192            // Environment — record name as metadata, sets trust boundary marker
193            let env_name = job_map
194                .get("environment")
195                .and_then(extract_environment_name);
196            let env_url = job_map.get("environment").and_then(extract_environment_url);
197
198            // Concatenated script body (before_script + script + after_script).
199            // Stamped on the Step node so script-aware rules (notably
200            // `untrusted_ci_var_in_shell_interpolation` and
201            // `ci_job_token_to_external_api`) can pattern-match without
202            // re-walking the YAML.
203            // Inline script body — concatenate before_script, script, after_script
204            // (each may be a string or a list-of-strings). Stamped on the Step so
205            // script-aware rules can pattern-match without re-parsing YAML.
206            let script_body = extract_script_body(job_map);
207
208            // GitLab `artifacts.reports.dotenv: <file>` — when set, the file's
209            // KEY=value lines are silently promoted to pipeline variables for
210            // any downstream job that consumes this one via `needs:` /
211            // `dependencies:`. Required input to
212            // `dotenv_artifact_flows_to_privileged_deployment`.
213            let dotenv_file = extract_dotenv_file(job_map);
214
215            // Upstream job names consumed via `needs:` / `dependencies:`.
216            // Used to build dotenv-flow chains across stages.
217            let needs = extract_needs(job_map);
218
219            // Detect whether this job's `rules:` / `only:` clause restricts
220            // execution to protected branches (or to the default branch,
221            // which is protected by GitLab default policy). Used by the
222            // `gitlab_deploy_job_missing_protected_branch_only` rule to
223            // detect deployment jobs that lack any branch guard.
224            let protected_only = job_has_protected_branch_restriction(job_map);
225
226            // Create the Step node for this job
227            let mut step_meta = HashMap::new();
228            step_meta.insert(META_JOB_NAME.into(), job_name.to_string());
229            if let Some(ref env) = env_name {
230                step_meta.insert(META_ENVIRONMENT_NAME.into(), env.clone());
231            }
232            if !script_body.is_empty() {
233                step_meta.insert(META_SCRIPT_BODY.into(), script_body);
234            }
235            if let Some(ref f) = dotenv_file {
236                step_meta.insert(META_DOTENV_FILE.into(), f.clone());
237            }
238            if !needs.is_empty() {
239                step_meta.insert(META_NEEDS.into(), needs.join(","));
240            }
241            if let Some(ref url) = env_url {
242                step_meta.insert(META_ENVIRONMENT_URL.into(), url.clone());
243            }
244            // Per-step MR trigger marker — graph-level META_TRIGGER applies to
245            // the file as a whole, but `id_token_audience_overscoped` needs to
246            // compare audience usage between MR-context and protected-context
247            // jobs in the same file.
248            if job_triggers_mr {
249                step_meta.insert(META_TRIGGER.into(), "merge_request".into());
250            }
251            // extends: list (comma-joined, in source order)
252            if !extends_names.is_empty() {
253                step_meta.insert(META_GITLAB_EXTENDS.into(), extends_names.join(","));
254            }
255            // allow_failure: true|false (only stamp when explicitly set so the
256            // rule can distinguish "absent" from "false")
257            if let Some(af) = job_map.get("allow_failure").and_then(|v| v.as_bool()) {
258                step_meta.insert(META_GITLAB_ALLOW_FAILURE.into(), af.to_string());
259            } else if job_map
260                .get("allow_failure")
261                .and_then(|v| v.as_mapping())
262                .is_some()
263            {
264                // `allow_failure: { exit_codes: [42] }` — conditional pass; treat
265                // as truthy for silent-skip detection.
266                step_meta.insert(META_GITLAB_ALLOW_FAILURE.into(), "true".into());
267            }
268            // dind sidecar detection: any service whose name matches docker:*-dind
269            if job_services_have_dind(job_map.get("services")) {
270                step_meta.insert(META_GITLAB_DIND_SERVICE.into(), "true".into());
271            }
272            // trigger: block — child / downstream pipeline
273            if let Some(kind) = classify_trigger(job_map.get("trigger")) {
274                step_meta.insert(META_GITLAB_TRIGGER_KIND.into(), kind.into());
275            }
276            // cache: structural capture (key + policy)
277            if let Some((cache_key, cache_policy)) = extract_cache_key_policy(job_map.get("cache"))
278            {
279                step_meta.insert(META_GITLAB_CACHE_KEY.into(), cache_key);
280                if let Some(p) = cache_policy {
281                    step_meta.insert(META_GITLAB_CACHE_POLICY.into(), p);
282                }
283            }
284            if protected_only {
285                step_meta.insert(META_RULES_PROTECTED_ONLY.into(), "true".into());
286            }
287            let step_id = graph.add_node_with_metadata(
288                NodeKind::Step,
289                job_name,
290                TrustZone::FirstParty,
291                step_meta,
292            );
293
294            // CI_JOB_TOKEN always available to every step
295            graph.add_edge(step_id, token_id, EdgeKind::HasAccessTo);
296
297            // Link all secrets
298            for &sid in global_secrets
299                .iter()
300                .chain(&job_secrets)
301                .chain(&explicit_secrets)
302            {
303                graph.add_edge(step_id, sid, EdgeKind::HasAccessTo);
304            }
305
306            // Link OIDC identities
307            for &iid in &oidc_identities {
308                graph.add_edge(step_id, iid, EdgeKind::HasAccessTo);
309            }
310
311            // UsesImage edges
312            if let Some(img_id) = image_id {
313                graph.add_edge(step_id, img_id, EdgeKind::UsesImage);
314            }
315            for &svc_id in &service_ids {
316                graph.add_edge(step_id, svc_id, EdgeKind::UsesImage);
317            }
318        }
319
320        // Cross-platform misclassification trap (red-team R2 #5): a YAML file
321        // with non-reserved top-level keys looks like a GitLab pipeline shape
322        // but its body may use constructs the GitLab parser doesn't recognise
323        // (e.g. an ADO `task:` payload). Mark Partial when the source had at
324        // least one job-shaped top-level key but we ended up with no Step
325        // nodes — better than silently returning completeness=complete on a
326        // clean-but-empty graph that a CI gate would treat as "passed".
327        let step_count = graph
328            .nodes
329            .iter()
330            .filter(|n| n.kind == NodeKind::Step)
331            .count();
332        let had_job_carrier = mapping.iter().any(|(k, v)| {
333            k.as_str()
334                .map(|name| !RESERVED.contains(&name) && !name.starts_with('.'))
335                .unwrap_or(false)
336                && v.as_mapping().is_some()
337        });
338        if step_count == 0 && had_job_carrier {
339            graph.mark_partial(
340                "non-reserved top-level keys parsed but produced 0 step nodes — possible non-GitLab YAML wrong-platform-classified".to_string(),
341            );
342        }
343
344        Ok(graph)
345    }
346}
347
348/// Detect `image:` string from a YAML value — can be a bare string or a mapping with `name:`.
349fn extract_image_str(v: &Value) -> Option<String> {
350    match v {
351        Value::String(s) => Some(s.clone()),
352        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
353        _ => None,
354    }
355}
356
357/// Extract environment name from `environment:` value (string or mapping).
358fn extract_environment_name(v: &Value) -> Option<String> {
359    match v {
360        Value::String(s) => Some(s.clone()),
361        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
362        _ => None,
363    }
364}
365
366/// Extract `environment:url:` value (only present when environment is a mapping).
367fn extract_environment_url(v: &Value) -> Option<String> {
368    match v {
369        Value::Mapping(m) => m.get("url").and_then(|u| u.as_str()).map(String::from),
370        _ => None,
371    }
372}
373
374/// Concatenate `before_script`, `script`, and `after_script` of a job into one
375/// string body (separated by newlines). Each section may be a single string or
376/// a list of strings. Empty sections are skipped.
377fn extract_script_body(job_map: &serde_yaml::Mapping) -> String {
378    let mut lines: Vec<String> = Vec::new();
379    for key in &["before_script", "script", "after_script"] {
380        if let Some(v) = job_map.get(*key) {
381            collect_script_lines(v, &mut lines);
382        }
383    }
384    lines.join("\n")
385}
386
387/// Append script lines from a YAML value (string or sequence of strings).
388fn collect_script_lines(v: &Value, out: &mut Vec<String>) {
389    match v {
390        Value::String(s) => out.push(s.clone()),
391        Value::Sequence(seq) => {
392            for item in seq {
393                if let Some(s) = item.as_str() {
394                    out.push(s.to_string());
395                }
396            }
397        }
398        _ => {}
399    }
400}
401
402/// Extract `artifacts.reports.dotenv` filename. Value may be a single string
403/// or a list of strings — for the list form we join with `,`.
404fn extract_dotenv_file(job_map: &serde_yaml::Mapping) -> Option<String> {
405    let dotenv = job_map
406        .get("artifacts")?
407        .as_mapping()?
408        .get("reports")?
409        .as_mapping()?
410        .get("dotenv")?;
411    match dotenv {
412        Value::String(s) => Some(s.clone()),
413        Value::Sequence(seq) => {
414            let parts: Vec<String> = seq
415                .iter()
416                .filter_map(|v| v.as_str().map(String::from))
417                .collect();
418            if parts.is_empty() {
419                None
420            } else {
421                Some(parts.join(","))
422            }
423        }
424        _ => None,
425    }
426}
427
428/// Extract upstream job names from `needs:` and `dependencies:`.
429/// `needs:` may be a list of strings or a list of mappings with `job:`.
430/// `dependencies:` is a list of strings.
431fn extract_needs(job_map: &serde_yaml::Mapping) -> Vec<String> {
432    let mut out: Vec<String> = Vec::new();
433    if let Some(needs) = job_map.get("needs").and_then(|v| v.as_sequence()) {
434        for item in needs {
435            match item {
436                Value::String(s) => out.push(s.clone()),
437                Value::Mapping(m) => {
438                    if let Some(j) = m.get("job").and_then(|j| j.as_str()) {
439                        out.push(j.to_string());
440                    }
441                }
442                _ => {}
443            }
444        }
445    }
446    if let Some(deps) = job_map.get("dependencies").and_then(|v| v.as_sequence()) {
447        for item in deps {
448            if let Some(s) = item.as_str() {
449                out.push(s.to_string());
450            }
451        }
452    }
453    out.sort();
454    out.dedup();
455    out
456}
457
458/// Classify a variable name as a credential by checking for common fragments.
459fn is_credential_name(name: &str) -> bool {
460    let upper = name.to_uppercase();
461    CRED_FRAGMENTS.iter().any(|frag| upper.contains(frag))
462}
463
464/// Parse `variables:` mapping and emit `Secret` nodes for credential-pattern names.
465/// Returns the list of created node IDs.
466fn process_variables(vars: Option<&Value>, graph: &mut AuthorityGraph, scope: &str) -> Vec<NodeId> {
467    let mut ids = Vec::new();
468    let map = match vars.and_then(|v| v.as_mapping()) {
469        Some(m) => m,
470        None => return ids,
471    };
472    for (k, _v) in map {
473        let name = match k.as_str() {
474            Some(s) => s,
475            None => continue,
476        };
477        if is_credential_name(name) {
478            let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
479            ids.push(id);
480            let _ = scope; // used for future scoped error messages
481        }
482    }
483    ids
484}
485
486/// Parse `secrets:` block and emit one `Secret` node per named secret.
487///
488/// GitLab CI `secrets:` format:
489/// ```yaml
490/// secrets:
491///   DATABASE_PASSWORD:
492///     vault: production/db/password@secret
493///   AWS_KEY:
494///     aws_secrets_manager:
495///       name: my-secret
496/// ```
497fn process_explicit_secrets(
498    secrets: Option<&Value>,
499    _scope: &str,
500    graph: &mut AuthorityGraph,
501) -> Vec<NodeId> {
502    let mut ids = Vec::new();
503    let map = match secrets.and_then(|v| v.as_mapping()) {
504        Some(m) => m,
505        None => return ids,
506    };
507    for (k, _v) in map {
508        let name = match k.as_str() {
509            Some(s) => s,
510            None => continue,
511        };
512        let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
513        ids.push(id);
514    }
515    ids
516}
517
518/// Parse `id_tokens:` block and emit one OIDC `Identity` node per token.
519///
520/// GitLab CI `id_tokens:` format:
521/// ```yaml
522/// id_tokens:
523///   SIGSTORE_ID_TOKEN:
524///     aud: sigstore
525///   AWS_OIDC_TOKEN:
526///     aud: https://sts.amazonaws.com
527/// ```
528fn process_id_tokens(
529    id_tokens: Option<&Value>,
530    _scope: &str,
531    graph: &mut AuthorityGraph,
532) -> Vec<NodeId> {
533    let mut ids = Vec::new();
534    let map = match id_tokens.and_then(|v| v.as_mapping()) {
535        Some(m) => m,
536        None => return ids,
537    };
538    for (k, v) in map {
539        let token_name = match k.as_str() {
540            Some(s) => s,
541            None => continue,
542        };
543        // Extract audience for labelling and as discrete metadata
544        // (rules like `id_token_audience_overscoped` need to compare audiences
545        // across jobs without re-parsing the label).
546        let aud = v
547            .as_mapping()
548            .and_then(|m| m.get("aud"))
549            .and_then(|a| a.as_str())
550            .unwrap_or("unknown");
551        let label = format!("{token_name} (aud={aud})");
552        let mut meta = HashMap::new();
553        meta.insert(META_OIDC.into(), "true".into());
554        meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
555        meta.insert(META_OIDC_AUDIENCE.into(), aud.to_string());
556        let id =
557            graph.add_node_with_metadata(NodeKind::Identity, label, TrustZone::FirstParty, meta);
558        ids.push(id);
559    }
560    ids
561}
562
563/// Parse `services:` block and emit `Image` nodes.
564fn process_services(services: Option<&Value>, graph: &mut AuthorityGraph) -> Vec<NodeId> {
565    let mut ids = Vec::new();
566    let list = match services.and_then(|v| v.as_sequence()) {
567        Some(s) => s,
568        None => return ids,
569    };
570    for item in list {
571        let img_str = match extract_image_str(item) {
572            Some(s) => s,
573            None => continue,
574        };
575        let pinned = is_docker_digest_pinned(&img_str);
576        let trust_zone = if pinned {
577            TrustZone::ThirdParty
578        } else {
579            TrustZone::Untrusted
580        };
581        let mut meta = HashMap::new();
582        if let Some(digest) = img_str.split("@sha256:").nth(1) {
583            meta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
584        }
585        let id = graph.add_node_with_metadata(NodeKind::Image, &img_str, trust_zone, meta);
586        ids.push(id);
587    }
588    ids
589}
590
591/// Check whether a job's `rules:` or `only:` indicates it runs on merge requests.
592fn job_has_mr_trigger(job_map: &serde_yaml::Mapping) -> bool {
593    // rules: [{if: '$CI_PIPELINE_SOURCE == "merge_request_event"'}]
594    if let Some(rules) = job_map.get("rules").and_then(|v| v.as_sequence()) {
595        for rule in rules {
596            if let Some(if_expr) = rule
597                .as_mapping()
598                .and_then(|m| m.get("if"))
599                .and_then(|v| v.as_str())
600            {
601                if if_expr.contains("merge_request_event") {
602                    return true;
603                }
604            }
605        }
606    }
607    // only: [merge_requests] or only: {refs: [merge_requests]}
608    if let Some(only) = job_map.get("only") {
609        if only_has_merge_requests(only) {
610            return true;
611        }
612    }
613    false
614}
615
616/// Check `only:` value (sequence or mapping) for `merge_requests` entry.
617fn only_has_merge_requests(v: &Value) -> bool {
618    match v {
619        Value::Sequence(seq) => seq
620            .iter()
621            .any(|item| item.as_str() == Some("merge_requests")),
622        Value::Mapping(m) => {
623            if let Some(refs) = m.get("refs").and_then(|r| r.as_sequence()) {
624                return refs
625                    .iter()
626                    .any(|item| item.as_str() == Some("merge_requests"));
627            }
628            false
629        }
630        _ => false,
631    }
632}
633
634/// Returns true when a job's `rules:` or `only:` clause restricts execution
635/// to protected refs only. The set of accepted patterns is intentionally
636/// generous because the goal is to *credit* defensive intent, not to
637/// audit-grade verify that every protection actually exists in GitLab's
638/// branch-protection settings — that lives outside the YAML.
639///
640/// Patterns recognised as a protected-only restriction:
641///
642///   * any `rules: [{ if: ... $CI_COMMIT_REF_PROTECTED ... }]`
643///   * any `rules: [{ if: ... $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH ... }]`
644///     (default branch is GitLab-protected by default)
645///   * any `rules: [{ if: ... $CI_COMMIT_TAG ... }]` (tags are protected by default)
646///   * `only: [main]` / `only: [master]` / `only: tags`
647///   * `only: { refs: [main, /^release/.*/] }`
648///
649/// Hits any one of the above → true. Misses every one → false.
650fn job_has_protected_branch_restriction(job_map: &serde_yaml::Mapping) -> bool {
651    if let Some(rules) = job_map.get("rules").and_then(|v| v.as_sequence()) {
652        for rule in rules {
653            let Some(if_expr) = rule
654                .as_mapping()
655                .and_then(|m| m.get("if"))
656                .and_then(|v| v.as_str())
657            else {
658                continue;
659            };
660            if if_expr.contains("$CI_COMMIT_REF_PROTECTED")
661                || if_expr.contains("CI_COMMIT_REF_PROTECTED")
662            {
663                return true;
664            }
665            if if_expr.contains("$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH")
666                || if_expr.contains("$CI_DEFAULT_BRANCH == $CI_COMMIT_BRANCH")
667            {
668                return true;
669            }
670            if if_expr.contains("$CI_COMMIT_TAG") {
671                return true;
672            }
673        }
674    }
675    if let Some(only) = job_map.get("only") {
676        if only_lists_protected_ref(only) {
677            return true;
678        }
679    }
680    false
681}
682
683/// Check `only:` for protected/default-branch refs (`main`, `master`, `tags`,
684/// or a `refs:` list containing those). Conservative — does NOT include
685/// `merge_requests` (that's the opposite signal).
686fn only_lists_protected_ref(v: &Value) -> bool {
687    fn is_protected_ref(s: &str) -> bool {
688        matches!(s, "main" | "master" | "tags") || s.starts_with("/^release")
689    }
690    match v {
691        Value::String(s) => is_protected_ref(s.as_str()),
692        Value::Sequence(seq) => seq
693            .iter()
694            .any(|item| item.as_str().map(is_protected_ref).unwrap_or(false)),
695        Value::Mapping(m) => {
696            if let Some(refs) = m.get("refs").and_then(|r| r.as_sequence()) {
697                return refs
698                    .iter()
699                    .any(|item| item.as_str().map(is_protected_ref).unwrap_or(false));
700            }
701            false
702        }
703        _ => false,
704    }
705}
706
707/// Check top-level `workflow:` rules for MR trigger.
708fn has_mr_trigger_in_workflow(wf: &Value) -> bool {
709    let rules = match wf
710        .as_mapping()
711        .and_then(|m| m.get("rules"))
712        .and_then(|r| r.as_sequence())
713    {
714        Some(r) => r,
715        None => return false,
716    };
717    for rule in rules {
718        if let Some(if_expr) = rule
719            .as_mapping()
720            .and_then(|m| m.get("if"))
721            .and_then(|v| v.as_str())
722        {
723            if if_expr.contains("merge_request_event") {
724                return true;
725            }
726        }
727    }
728    false
729}
730
/// Structured representation of a single `include:` entry.
///
/// Serialised into `AuthorityGraph::metadata[META_GITLAB_INCLUDES]` so that
/// downstream rules (e.g. `unpinned_include_remote_or_branch_ref`) can analyse
/// remote-URL pins, project refs, and missing `ref:` defaults without re-parsing
/// the YAML.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct IncludeEntry {
    /// Include kind: `local`, `remote`, `template`, `project`, `component`, or
    /// `unknown` for shapes we don't recognise.
    pub kind: String,
    /// The path / URL / project string the include points at. For `component`
    /// includes this is the part before the last `@`.
    pub target: String,
    /// The resolved `ref:` value. Empty string when the include omits a `ref:`
    /// (defaults to HEAD on the source repo, which is itself a finding).
    /// For `component` includes this is the version after the last `@`;
    /// `local`, `remote` and `template` includes always leave it empty.
    pub git_ref: String,
}
748
749/// Parse the top-level `include:` value into a flat list of `IncludeEntry`s.
750///
751/// `include:` accepts five shapes — string, sequence-of-strings, sequence-of-mappings,
752/// sequence-of-strings-mixed-with-mappings, and a single mapping. Normalise all of
753/// them into one flat list so the rule layer doesn't have to.
754pub fn extract_include_entries(v: &Value) -> Vec<IncludeEntry> {
755    let mut out = Vec::new();
756    match v {
757        // `include: 'path/to/local.yml'` — sugar for a local include
758        Value::String(s) => {
759            out.push(IncludeEntry {
760                kind: classify_string_include(s).into(),
761                target: s.clone(),
762                git_ref: String::new(),
763            });
764        }
765        Value::Sequence(seq) => {
766            for item in seq {
767                match item {
768                    Value::String(s) => {
769                        out.push(IncludeEntry {
770                            kind: classify_string_include(s).into(),
771                            target: s.clone(),
772                            git_ref: String::new(),
773                        });
774                    }
775                    Value::Mapping(m) => {
776                        if let Some(e) = include_entry_from_mapping(m) {
777                            out.push(e);
778                        }
779                    }
780                    _ => {}
781                }
782            }
783        }
784        Value::Mapping(m) => {
785            if let Some(e) = include_entry_from_mapping(m) {
786                out.push(e);
787            }
788        }
789        _ => {}
790    }
791    out
792}
793
/// Heuristic for the string shorthand of `include:`: a value that starts with
/// `http://` or `https://` (compared ASCII case-insensitively) is a `remote`
/// include; everything else is a `local` path.
fn classify_string_include(s: &str) -> &'static str {
    let has_scheme = |scheme: &str| {
        s.as_bytes()
            .get(..scheme.len())
            .map_or(false, |prefix| prefix.eq_ignore_ascii_case(scheme.as_bytes()))
    };
    if has_scheme("http://") || has_scheme("https://") {
        "remote"
    } else {
        "local"
    }
}
804
805/// Lift one of the four mapping forms (`local:`, `remote:`, `template:`,
806/// `project:`, `component:`) into an `IncludeEntry`. Returns None when the
807/// mapping has none of the recognised keys.
808fn include_entry_from_mapping(m: &serde_yaml::Mapping) -> Option<IncludeEntry> {
809    let str_at = |key: &str| {
810        m.get(key)
811            .and_then(|v| v.as_str())
812            .map(str::to_string)
813            .unwrap_or_default()
814    };
815    if let Some(s) = m.get("local").and_then(|v| v.as_str()) {
816        return Some(IncludeEntry {
817            kind: "local".into(),
818            target: s.to_string(),
819            git_ref: String::new(),
820        });
821    }
822    if let Some(s) = m.get("remote").and_then(|v| v.as_str()) {
823        return Some(IncludeEntry {
824            kind: "remote".into(),
825            target: s.to_string(),
826            git_ref: String::new(),
827        });
828    }
829    if let Some(s) = m.get("template").and_then(|v| v.as_str()) {
830        return Some(IncludeEntry {
831            kind: "template".into(),
832            target: s.to_string(),
833            git_ref: String::new(),
834        });
835    }
836    if let Some(s) = m.get("component").and_then(|v| v.as_str()) {
837        // GitLab CI/CD components: source@version → version is the pin
838        let (target, git_ref) = match s.rsplit_once('@') {
839            Some((path, ver)) => (path.to_string(), ver.to_string()),
840            None => (s.to_string(), String::new()),
841        };
842        return Some(IncludeEntry {
843            kind: "component".into(),
844            target,
845            git_ref,
846        });
847    }
848    if m.contains_key("project") {
849        let project = str_at("project");
850        // ref: may be missing → empty string indicates HEAD/default branch,
851        // which is itself a supply-chain finding.
852        let git_ref = str_at("ref");
853        return Some(IncludeEntry {
854            kind: "project".into(),
855            target: project,
856            git_ref,
857        });
858    }
859    None
860}
861
862/// Extract a flat list of template names from an `extends:` value.
863/// `extends:` accepts a single string or a sequence of strings.
864fn extract_extends_list(v: Option<&Value>) -> Vec<String> {
865    let v = match v {
866        Some(v) => v,
867        None => return Vec::new(),
868    };
869    match v {
870        Value::String(s) => vec![s.clone()],
871        Value::Sequence(seq) => seq
872            .iter()
873            .filter_map(|i| i.as_str().map(str::to_string))
874            .collect(),
875        _ => Vec::new(),
876    }
877}
878
879/// Returns true when any entry in `services:` has an image name matching
880/// `docker:*-dind` (or bare `docker:dind`). Recognises both shapes:
881/// `services: [docker:dind]` and `services: [{name: docker:dind}]`.
882fn job_services_have_dind(services: Option<&Value>) -> bool {
883    let list = match services.and_then(|v| v.as_sequence()) {
884        Some(s) => s,
885        None => return false,
886    };
887    for item in list {
888        let img = match extract_image_str(item) {
889            Some(s) => s,
890            None => continue,
891        };
892        if image_is_dind(&img) {
893            return true;
894        }
895    }
896    false
897}
898
/// Decide whether an image reference names a Docker-in-Docker image.
///
/// Matches e.g. `docker:dind`, `docker:24.0-dind`, `docker:24-dind` and
/// `docker:24.0.7-dind-rootless`, compared ASCII case-insensitively.
///
/// Anything from the first `@` onwards (a `@sha256:...` digest pin) is
/// ignored. The remaining reference must start with `docker:` or `docker/`
/// and contain `dind` somewhere.
fn image_is_dind(image: &str) -> bool {
    // Drop the digest first so a "dind" inside it can never match.
    let reference = image
        .split_once('@')
        .map_or(image, |(name_and_tag, _digest)| name_and_tag)
        .to_ascii_lowercase();
    let in_docker_family = ["docker:", "docker/"]
        .iter()
        .any(|&prefix| reference.starts_with(prefix));
    in_docker_family && reference.contains("dind")
}
915
916/// Classify a `trigger:` block as either `static` (in-tree YAML / fixed
917/// downstream project) or `dynamic` (include from a previous job's artifact —
918/// dynamic child pipelines, the code-injection sink). Returns None when no
919/// `trigger:` block is present.
920fn classify_trigger(trigger: Option<&Value>) -> Option<&'static str> {
921    let t = trigger?;
922    // Shorthand: `trigger: my/downstream/project` → static
923    if t.is_string() {
924        return Some("static");
925    }
926    let m = t.as_mapping()?;
927    // Look at every `include:` entry under trigger; if ANY one references an
928    // `artifact:` field, the child pipeline is dynamic.
929    if let Some(inc) = m.get("include") {
930        if include_has_artifact_source(inc) {
931            return Some("dynamic");
932        }
933    }
934    Some("static")
935}
936
937/// Walk a `trigger.include:` value (string / sequence / mapping) and return
938/// true when any entry's mapping carries an `artifact:` key.
939fn include_has_artifact_source(v: &Value) -> bool {
940    match v {
941        Value::Mapping(m) => m.contains_key("artifact"),
942        Value::Sequence(seq) => seq.iter().any(|i| {
943            i.as_mapping()
944                .map(|m| m.contains_key("artifact"))
945                .unwrap_or(false)
946        }),
947        _ => false,
948    }
949}
950
951/// Extract `(cache.key, cache.policy)` from a job's `cache:` value. Returns
952/// `None` when no cache is declared. `cache:` may be a sequence of mappings
953/// (multiple caches); we capture the first key/policy pair so the rule layer
954/// has at least one signal — multi-cache analysis is left to a future
955/// extension.
956///
957/// `cache.key:` may be:
958/// - a string: `key: vendor`
959/// - a mapping: `key: { files: [Gemfile.lock] }` → captured as `files:Gemfile.lock,...`
960/// - a mapping with `prefix:` → captured as `prefix:<value>`
961fn extract_cache_key_policy(v: Option<&Value>) -> Option<(String, Option<String>)> {
962    let v = v?;
963    let m = match v {
964        Value::Mapping(m) => m,
965        Value::Sequence(seq) => {
966            // First cache wins — same heuristic used elsewhere.
967            return seq
968                .iter()
969                .find_map(|i| i.as_mapping().and_then(extract_cache_key_policy_map));
970        }
971        _ => return None,
972    };
973    extract_cache_key_policy_map(m)
974}
975
976fn extract_cache_key_policy_map(m: &serde_yaml::Mapping) -> Option<(String, Option<String>)> {
977    let key = match m.get("key") {
978        Some(Value::String(s)) => s.clone(),
979        Some(Value::Number(n)) => n.to_string(),
980        Some(Value::Bool(b)) => b.to_string(),
981        Some(Value::Mapping(km)) => {
982            let mut parts = Vec::new();
983            if let Some(prefix) = km.get("prefix").and_then(|v| v.as_str()) {
984                parts.push(format!("prefix:{prefix}"));
985            }
986            if let Some(files) = km.get("files").and_then(|v| v.as_sequence()) {
987                let names: Vec<String> = files
988                    .iter()
989                    .filter_map(|f| f.as_str().map(str::to_string))
990                    .collect();
991                if !names.is_empty() {
992                    parts.push(format!("files:{}", names.join(",")));
993                }
994            }
995            if parts.is_empty() {
996                String::new()
997            } else {
998                parts.join(";")
999            }
1000        }
1001        _ => String::new(),
1002    };
1003    let policy = m.get("policy").and_then(|v| v.as_str()).map(str::to_string);
1004    Some((key, policy))
1005}
1006
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `GitlabParser` over `yaml` as a `.gitlab-ci.yml` with no repo, ref
    /// or commit information. Panics when parsing fails.
    fn parse(yaml: &str) -> AuthorityGraph {
        let source = PipelineSource {
            file: ".gitlab-ci.yml".into(),
            repo: None,
            git_ref: None,
            commit_sha: None,
        };
        GitlabParser.parse(yaml, &source).unwrap()
    }

    /// Number of `Step` nodes in the graph.
    fn step_count(graph: &AuthorityGraph) -> usize {
        graph
            .nodes
            .iter()
            .filter(|n| n.kind == NodeKind::Step)
            .count()
    }

    /// Graph-level `META_TRIGGER` value, if one was recorded.
    fn trigger_of(graph: &AuthorityGraph) -> Option<&str> {
        graph.metadata.get(META_TRIGGER).map(String::as_str)
    }

    #[test]
    fn ci_job_token_always_present() {
        let pipeline = r#"
stages:
  - build

build-job:
  stage: build
  script:
    - make build
"#;
        let graph = parse(pipeline);
        let identities: Vec<_> = graph.nodes_of_kind(NodeKind::Identity).collect();
        assert_eq!(identities.len(), 1);

        let token = &identities[0];
        assert_eq!(token.name, "CI_JOB_TOKEN");
        // The job token is expected to be flagged implicit and broadly scoped.
        for (key, expected) in [(META_IMPLICIT, "true"), (META_IDENTITY_SCOPE, "broad")] {
            assert_eq!(
                token.metadata.get(key).map(String::as_str),
                Some(expected)
            );
        }
    }

    #[test]
    fn global_credential_variable_emits_secret_node() {
        let pipeline = r#"
variables:
  APP_VERSION: "1.0"
  DEPLOY_TOKEN: "$CI_DEPLOY_TOKEN"

build-job:
  script:
    - make
"#;
        let graph = parse(pipeline);
        let secrets: Vec<_> = graph.nodes_of_kind(NodeKind::Secret).collect();
        let has_secret = |wanted: &str| secrets.iter().any(|s| s.name == wanted);

        // Credential-looking variable name → Secret node.
        assert!(
            has_secret("DEPLOY_TOKEN"),
            "DEPLOY_TOKEN must emit a Secret node, got: {:?}",
            secrets.iter().map(|s| &s.name).collect::<Vec<_>>()
        );
        // Ordinary configuration variable → no Secret node.
        assert!(
            !has_secret("APP_VERSION"),
            "APP_VERSION must not emit a Secret node"
        );
    }

    #[test]
    fn floating_image_emits_untrusted_image_node() {
        let pipeline = r#"
deploy:
  image: alpine:latest
  script:
    - deploy.sh
"#;
        let graph = parse(pipeline);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(images.len(), 1);

        let image = &images[0];
        assert_eq!(image.name, "alpine:latest");
        assert_eq!(image.trust_zone, TrustZone::Untrusted);
    }

    #[test]
    fn digest_pinned_image_is_third_party() {
        let pipeline = r#"
deploy:
  image: "alpine@sha256:a5ac7e51b41094c92402da3b24376905380afc29a5ac7e51b41094c92402da3b"
  script:
    - deploy.sh
"#;
        let graph = parse(pipeline);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].trust_zone, TrustZone::ThirdParty);
    }

    #[test]
    fn id_tokens_emit_oidc_identity_nodes() {
        let pipeline = r#"
deploy:
  id_tokens:
    SIGSTORE_ID_TOKEN:
      aud: sigstore
    AWS_OIDC_TOKEN:
      aud: https://sts.amazonaws.com
  script:
    - deploy.sh
"#;
        let graph = parse(pipeline);
        // Keep only identities flagged as OIDC.
        let oidc: Vec<_> = graph
            .nodes_of_kind(NodeKind::Identity)
            .filter(|n| n.metadata.get(META_OIDC).map(String::as_str) == Some("true"))
            .collect();
        assert_eq!(
            oidc.len(),
            2,
            "expected 2 OIDC identity nodes, got: {:?}",
            oidc.iter().map(|n| &n.name).collect::<Vec<_>>()
        );
    }

    #[test]
    fn explicit_secrets_emit_secret_nodes() {
        let pipeline = r#"
deploy:
  secrets:
    DATABASE_PASSWORD:
      vault: production/db/password@secret
    AWS_KEY:
      aws_secrets_manager:
        name: my-secret
  script:
    - deploy.sh
"#;
        let graph = parse(pipeline);
        let secrets: Vec<_> = graph.nodes_of_kind(NodeKind::Secret).collect();
        let names: Vec<_> = secrets.iter().map(|s| s.name.as_str()).collect();
        for wanted in ["DATABASE_PASSWORD", "AWS_KEY"] {
            assert!(names.contains(&wanted), "got: {names:?}");
        }
    }

    #[test]
    fn rules_mr_trigger_sets_meta_trigger() {
        let pipeline = r#"
test:
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
  script:
    - run tests
"#;
        let graph = parse(pipeline);
        assert_eq!(
            trigger_of(&graph),
            Some("merge_request"),
            "META_TRIGGER must be set to merge_request"
        );
    }

    #[test]
    fn only_merge_requests_sets_meta_trigger() {
        let pipeline = r#"
test:
  only:
    - merge_requests
  script:
    - run tests
"#;
        let graph = parse(pipeline);
        assert_eq!(trigger_of(&graph), Some("merge_request"));
    }

    #[test]
    fn include_marks_graph_partial() {
        let pipeline = r#"
include:
  - local: '/templates/.base.yml'

build:
  script:
    - make
"#;
        assert_eq!(parse(pipeline).completeness, AuthorityCompleteness::Partial);
    }

    #[test]
    fn extends_marks_graph_partial() {
        let pipeline = r#"
.base:
  script:
    - echo base

my-job:
  extends: .base
  stage: build
"#;
        assert_eq!(parse(pipeline).completeness, AuthorityCompleteness::Partial);
    }

    #[test]
    fn meta_job_name_set_on_step_nodes() {
        let pipeline = r#"
build:
  script:
    - make
deploy:
  script:
    - deploy.sh
"#;
        let graph = parse(pipeline);
        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
        assert_eq!(steps.len(), 2);

        // Every step must carry the key at all ...
        for step in &steps {
            assert!(
                step.metadata.contains_key(META_JOB_NAME),
                "Step '{}' missing META_JOB_NAME",
                step.name
            );
        }
        // ... and the recorded values must be the job names from the YAML.
        let names: Vec<_> = steps
            .iter()
            .map(|s| s.metadata.get(META_JOB_NAME).unwrap().as_str())
            .collect();
        for wanted in ["build", "deploy"] {
            assert!(names.contains(&wanted), "got: {names:?}");
        }
    }

    #[test]
    fn reserved_keywords_not_parsed_as_jobs() {
        let pipeline = r#"
stages:
  - build
  - test

variables:
  MY_VAR: value

image: alpine:latest

build:
  stage: build
  script:
    - make
"#;
        let graph = parse(pipeline);
        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
        assert_eq!(
            steps.len(),
            1,
            "only 'build' should be a Step, got: {:?}",
            steps.iter().map(|s| &s.name).collect::<Vec<_>>()
        );
        assert_eq!(steps[0].name, "build");
    }

    #[test]
    fn services_emit_image_nodes() {
        let pipeline = r#"
test:
  services:
    - docker:dind
    - name: postgres:14
  script:
    - run_tests
"#;
        let graph = parse(pipeline);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(
            images.len(),
            2,
            "expected 2 service Image nodes, got: {:?}",
            images.iter().map(|i| &i.name).collect::<Vec<_>>()
        );
    }

    // ── Cross-platform misclassification trap (red-team R2 #5) ─────

    #[test]
    fn job_carrier_with_unparseable_bodies_marks_partial() {
        // Despite the test's name, the expected outcome here is `Complete`.
        // The top-level keys look like job names, but their values are lists
        // rather than mappings, so no Step is produced for them. Per the
        // original author's note, the "had_job_carrier" heuristic only fires
        // for mapping-valued keys — it targets inputs with a *valid mapping
        // shape* that the GitLab parser cannot interpret — so non-mapping
        // values intentionally do not downgrade the graph.
        let pipeline = r#"
build:
  - this is a list, not a mapping
test:
  - also a list
"#;
        let graph = parse(pipeline);
        assert_eq!(step_count(&graph), 0);
        assert_eq!(
            graph.completeness,
            AuthorityCompleteness::Complete,
            "non-mapping values are not job carriers"
        );
    }

    #[test]
    fn mapping_jobs_without_recognisable_step_content_marks_partial() {
        // The only candidate is a hidden/template job (name starts with '.').
        // This is the shape that yields zero Steps while a mapping-valued
        // carrier is present: per the original author's note, any ordinary
        // mapping-valued non-reserved key would still become a Step.
        let pipeline = r#"
.template-only:
  script:
    - echo "this is a template-only file"
"#;
        let graph = parse(pipeline);
        assert_eq!(step_count(&graph), 0);
        // Hidden jobs are expected to mark the graph partial on their own.
        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
    }
}