Skip to main content

taudit_parse_gitlab/
lib.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_yaml::Value;
5use taudit_core::error::TauditError;
6use taudit_core::graph::*;
7// Re-import explicitly to make the new constants visible at a glance.
8#[allow(unused_imports)]
9use taudit_core::graph::{META_DOTENV_FILE, META_ENVIRONMENT_NAME, META_NEEDS, META_SCRIPT_BODY};
10use taudit_core::ports::PipelineParser;
11
/// GitLab CI YAML parser.
///
/// Parses `.gitlab-ci.yml` files into an `AuthorityGraph`. The authority model:
/// - Each non-reserved, non-hidden top-level mapping is a job and becomes a `Step` node.
/// - `CI_JOB_TOKEN` is a global implicit `Identity` (always present, scope=broad).
/// - `secrets:` entries emit `Secret` nodes with `HasAccessTo` edges.
/// - `id_tokens:` entries emit OIDC `Identity` nodes.
/// - `variables:` entries with credential-pattern names emit `Secret` nodes.
/// - `image:` and `services:` emit `Image` nodes with `UsesImage` edges.
/// - `include:`, `extends:` and hidden (`.`-prefixed) template jobs mark the graph `Partial`.
/// - `rules: if: merge_request_event` and `only: merge_requests` set `META_TRIGGER`.
pub struct GitlabParser;
24
/// Reserved top-level keys that are not job definitions.
///
/// The parser skips any top-level key found in this list when walking the
/// root mapping for jobs; every other string key is treated as a job name.
const RESERVED: &[&str] = &[
    "stages",
    "workflow",
    "include",
    "variables",
    "image",
    "services",
    "default",
    "cache",
    "before_script",
    "after_script",
    "types",
];
39
/// Variable name fragments that indicate a credential rather than plain config.
///
/// Matching is a substring test against the upper-cased variable name (see
/// `is_credential_name`), so entries here must be upper-case and short
/// fragments such as `CERT` also match longer names that merely contain them.
const CRED_FRAGMENTS: &[&str] = &[
    "TOKEN",
    "SECRET",
    "PASSWORD",
    "PASSWD",
    "PRIVATE_KEY",
    "API_KEY",
    "APIKEY",
    "SIGNING_KEY",
    "ACCESS_KEY",
    "SERVICE_ACCOUNT",
    "CERT",
    "CREDENTIAL",
];
55
impl PipelineParser for GitlabParser {
    /// Platform identifier reported by this parser.
    ///
    /// NOTE(review): this returns `"gitlab-ci"` while `parse` stamps
    /// `META_PLATFORM` as `"gitlab"` — confirm the two are meant to differ.
    fn platform(&self) -> &str {
        "gitlab-ci"
    }

    /// Parse a GitLab CI YAML document into an `AuthorityGraph`.
    ///
    /// Only the first YAML document in `content` is read. Every top-level key
    /// that is a string, is not in `RESERVED`, does not start with `.`, and
    /// maps to a mapping is treated as a job and becomes one `Step` node.
    ///
    /// # Errors
    ///
    /// Returns `TauditError::Parse` when `content` holds no YAML document,
    /// when the first document fails to deserialize, or when its root is not
    /// a mapping.
    fn parse(&self, content: &str, source: &PipelineSource) -> Result<AuthorityGraph, TauditError> {
        // Take only the first document of a (possibly multi-document) stream.
        let mut de = serde_yaml::Deserializer::from_str(content);
        let doc = de
            .next()
            .ok_or_else(|| TauditError::Parse("empty YAML document".into()))?;
        let root: Value = Value::deserialize(doc)
            .map_err(|e| TauditError::Parse(format!("YAML parse error: {e}")))?;

        let mapping = root
            .as_mapping()
            .ok_or_else(|| TauditError::Parse("GitLab CI root must be a mapping".into()))?;

        let mut graph = AuthorityGraph::new(source.clone());
        graph.metadata.insert(META_PLATFORM.into(), "gitlab".into());

        // CI_JOB_TOKEN is always present in every GitLab CI job — it's the built-in
        // platform token, equivalent to ADO's System.AccessToken or GHA's GITHUB_TOKEN.
        let mut meta = HashMap::new();
        meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
        meta.insert(META_IMPLICIT.into(), "true".into());
        let token_id = graph.add_node_with_metadata(
            NodeKind::Identity,
            "CI_JOB_TOKEN",
            TrustZone::FirstParty,
            meta,
        );

        // Top-level include: → mark Partial immediately AND capture each
        // entry's structure as graph metadata so include-pinning rules can
        // reason about remote URLs and unpinned project refs.
        if let Some(inc) = mapping.get("include") {
            graph.mark_partial(
                GapKind::Expression,
                "include: directive present — included templates not resolved".to_string(),
            );
            let entries = extract_include_entries(inc);
            if !entries.is_empty() {
                // A serialization failure is ignored: the graph is already
                // marked Partial above, only the structured detail is lost.
                if let Ok(json) = serde_json::to_string(&entries) {
                    graph.metadata.insert(META_GITLAB_INCLUDES.into(), json);
                }
            }
        }

        // Global variables — the resulting Secret nodes are linked to every job below.
        let global_secrets = process_variables(mapping.get("variables"), &mut graph, "pipeline");

        // Global image — used as the fallback for jobs without their own `image:`.
        // NOTE(review): top-level `services:` and the `default:` block are not
        // read here (both are in RESERVED and skipped) — confirm that is intended.
        let global_image = mapping.get("image").and_then(extract_image_str);

        // Top-level merge_request trigger detection from `workflow:` rules
        if let Some(wf) = mapping.get("workflow") {
            if has_mr_trigger_in_workflow(wf) {
                graph
                    .metadata
                    .insert(META_TRIGGER.into(), "merge_request".into());
            }
        }

        // Process each job (any top-level key not in RESERVED)
        for (key, value) in mapping {
            // Non-string keys cannot be job names; skip them.
            let job_name = match key.as_str() {
                Some(k) => k,
                None => continue,
            };
            if RESERVED.contains(&job_name) {
                continue;
            }

            // Hidden jobs (starting with a dot) are templates — mark Partial, skip
            if job_name.starts_with('.') {
                graph.mark_partial(
                    GapKind::Expression,
                    format!("job '{job_name}' is a hidden/template job — not resolved"),
                );
                continue;
            }

            // A non-mapping value under a non-reserved key is not a job; it is
            // skipped silently (see the zero-step check after the loop).
            let job_map = match value.as_mapping() {
                Some(m) => m,
                None => continue,
            };

            // extends: — job template inheritance, can't resolve statically
            let extends_names = extract_extends_list(job_map.get("extends"));
            if !extends_names.is_empty() {
                graph.mark_partial(
                    GapKind::Expression,
                    format!(
                        "job '{job_name}' uses extends: — inherited configuration not resolved"
                    ),
                );
            }

            // Detect PR/MR trigger in this job's rules: or only:
            let job_triggers_mr = job_has_mr_trigger(job_map);

            // Propagate job MR trigger to graph level (never overwrites a
            // value already set, e.g. by the `workflow:` check above).
            if job_triggers_mr && !graph.metadata.contains_key(META_TRIGGER) {
                graph
                    .metadata
                    .insert(META_TRIGGER.into(), "merge_request".into());
            }

            // Job-level variables
            let job_secrets = process_variables(job_map.get("variables"), &mut graph, job_name);

            // Job-level explicit secrets: (Vault, AWS Secrets Manager, GCP, Azure)
            let explicit_secrets =
                process_explicit_secrets(job_map.get("secrets"), job_name, &mut graph);

            // Job-level OIDC tokens (id_tokens:)
            let oidc_identities = process_id_tokens(job_map.get("id_tokens"), job_name, &mut graph);

            // Job image (falls back to global)
            let job_image_str = job_map
                .get("image")
                .and_then(extract_image_str)
                .or(global_image.as_deref().map(String::from));

            // An Image node is requested per job, including for the shared
            // global image. NOTE(review): whether identical images collapse
            // into one node depends on `add_node_with_metadata` — confirm.
            let image_id = job_image_str.as_deref().map(|img| {
                // Digest-pinned images are ThirdParty; anything else is Untrusted.
                let pinned = is_docker_digest_pinned(img);
                let trust_zone = if pinned {
                    TrustZone::ThirdParty
                } else {
                    TrustZone::Untrusted
                };
                let mut imeta = HashMap::new();
                // Record the text following the first `@sha256:` as the digest.
                if let Some(digest) = img.split("@sha256:").nth(1) {
                    imeta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
                }
                graph.add_node_with_metadata(NodeKind::Image, img, trust_zone, imeta)
            });

            // Services (each is an Image node)
            let service_ids = process_services(job_map.get("services"), &mut graph);

            // Environment — record name (and URL, when given) as step metadata.
            let env_name = job_map
                .get("environment")
                .and_then(extract_environment_name);
            let env_url = job_map.get("environment").and_then(extract_environment_url);

            // Concatenated script body (before_script + script + after_script;
            // each may be a string or a list of strings). Stamped on the Step
            // node so script-aware rules (notably
            // `untrusted_ci_var_in_shell_interpolation` and
            // `ci_job_token_to_external_api`) can pattern-match without
            // re-walking the YAML.
            let script_body = extract_script_body(job_map);

            // GitLab `artifacts.reports.dotenv: <file>` — when set, the file's
            // KEY=value lines are silently promoted to pipeline variables for
            // any downstream job that consumes this one via `needs:` /
            // `dependencies:`. Required input to
            // `dotenv_artifact_flows_to_privileged_deployment`.
            let dotenv_file = extract_dotenv_file(job_map);

            // Upstream job names consumed via `needs:` / `dependencies:`.
            // Used to build dotenv-flow chains across stages.
            let needs = extract_needs(job_map);

            // Detect whether this job's `rules:` / `only:` clause restricts
            // execution to protected branches (or to the default branch,
            // which is protected by GitLab default policy). Used by the
            // `gitlab_deploy_job_missing_protected_branch_only` rule to
            // detect deployment jobs that lack any branch guard.
            let protected_only = job_has_protected_branch_restriction(job_map);

            // Create the Step node for this job. Optional keys are only
            // stamped when present so rules can tell "absent" from "empty".
            let mut step_meta = HashMap::new();
            step_meta.insert(META_JOB_NAME.into(), job_name.to_string());
            if let Some(ref env) = env_name {
                step_meta.insert(META_ENVIRONMENT_NAME.into(), env.clone());
            }
            if !script_body.is_empty() {
                step_meta.insert(META_SCRIPT_BODY.into(), script_body);
            }
            if let Some(ref f) = dotenv_file {
                step_meta.insert(META_DOTENV_FILE.into(), f.clone());
            }
            if !needs.is_empty() {
                step_meta.insert(META_NEEDS.into(), needs.join(","));
            }
            if let Some(ref url) = env_url {
                step_meta.insert(META_ENVIRONMENT_URL.into(), url.clone());
            }
            // Per-step MR trigger marker — graph-level META_TRIGGER applies to
            // the file as a whole, but `id_token_audience_overscoped` needs to
            // compare audience usage between MR-context and protected-context
            // jobs in the same file.
            if job_triggers_mr {
                step_meta.insert(META_TRIGGER.into(), "merge_request".into());
            }
            // extends: list (comma-joined, in source order)
            if !extends_names.is_empty() {
                step_meta.insert(META_GITLAB_EXTENDS.into(), extends_names.join(","));
            }
            // allow_failure: true|false (only stamp when explicitly set so the
            // rule can distinguish "absent" from "false")
            if let Some(af) = job_map.get("allow_failure").and_then(|v| v.as_bool()) {
                step_meta.insert(META_GITLAB_ALLOW_FAILURE.into(), af.to_string());
            } else if job_map
                .get("allow_failure")
                .and_then(|v| v.as_mapping())
                .is_some()
            {
                // `allow_failure: { exit_codes: [42] }` — conditional pass; treat
                // as truthy for silent-skip detection.
                step_meta.insert(META_GITLAB_ALLOW_FAILURE.into(), "true".into());
            }
            // dind sidecar detection: any service whose name matches docker:*-dind
            if job_services_have_dind(job_map.get("services")) {
                step_meta.insert(META_GITLAB_DIND_SERVICE.into(), "true".into());
            }
            // trigger: block — child / downstream pipeline
            if let Some(kind) = classify_trigger(job_map.get("trigger")) {
                step_meta.insert(META_GITLAB_TRIGGER_KIND.into(), kind.into());
            }
            // cache: structural capture (key + policy)
            if let Some((cache_key, cache_policy)) = extract_cache_key_policy(job_map.get("cache"))
            {
                step_meta.insert(META_GITLAB_CACHE_KEY.into(), cache_key);
                if let Some(p) = cache_policy {
                    step_meta.insert(META_GITLAB_CACHE_POLICY.into(), p);
                }
            }
            if protected_only {
                step_meta.insert(META_RULES_PROTECTED_ONLY.into(), "true".into());
            }
            let step_id = graph.add_node_with_metadata(
                NodeKind::Step,
                job_name,
                TrustZone::FirstParty,
                step_meta,
            );

            // CI_JOB_TOKEN always available to every step
            graph.add_edge(step_id, token_id, EdgeKind::HasAccessTo);

            // Link all secrets: pipeline-level, job-level variables, then
            // explicit `secrets:` entries.
            for &sid in global_secrets
                .iter()
                .chain(&job_secrets)
                .chain(&explicit_secrets)
            {
                graph.add_edge(step_id, sid, EdgeKind::HasAccessTo);
            }

            // Link OIDC identities
            for &iid in &oidc_identities {
                graph.add_edge(step_id, iid, EdgeKind::HasAccessTo);
            }

            // UsesImage edges
            if let Some(img_id) = image_id {
                graph.add_edge(step_id, img_id, EdgeKind::UsesImage);
            }
            for &svc_id in &service_ids {
                graph.add_edge(step_id, svc_id, EdgeKind::UsesImage);
            }
        }

        // Cross-platform misclassification trap (red-team R2 #5): a YAML file
        // with non-reserved top-level keys looks like a GitLab pipeline shape
        // but its body may use constructs the GitLab parser doesn't recognise
        // (e.g. an ADO `task:` payload). Mark Partial when the source had at
        // least one job-shaped top-level key but we ended up with no Step
        // nodes — better than silently returning completeness=complete on a
        // clean-but-empty graph that a CI gate would treat as "passed".
        let step_count = graph
            .nodes
            .iter()
            .filter(|n| n.kind == NodeKind::Step)
            .count();
        // "Job-shaped" = string key, not reserved, not hidden, mapping value —
        // the same conditions under which the loop above creates a Step.
        let had_job_carrier = mapping.iter().any(|(k, v)| {
            k.as_str()
                .map(|name| !RESERVED.contains(&name) && !name.starts_with('.'))
                .unwrap_or(false)
                && v.as_mapping().is_some()
        });
        if step_count == 0 && had_job_carrier {
            graph.mark_partial(
                GapKind::Expression,
                "non-reserved top-level keys parsed but produced 0 step nodes — possible non-GitLab YAML wrong-platform-classified".to_string(),
            );
        }

        graph.stamp_edge_authority_summaries();
        Ok(graph)
    }
}
354
355/// Detect `image:` string from a YAML value — can be a bare string or a mapping with `name:`.
356fn extract_image_str(v: &Value) -> Option<String> {
357    match v {
358        Value::String(s) => Some(s.clone()),
359        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
360        _ => None,
361    }
362}
363
364/// Extract environment name from `environment:` value (string or mapping).
365fn extract_environment_name(v: &Value) -> Option<String> {
366    match v {
367        Value::String(s) => Some(s.clone()),
368        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
369        _ => None,
370    }
371}
372
373/// Extract `environment:url:` value (only present when environment is a mapping).
374fn extract_environment_url(v: &Value) -> Option<String> {
375    match v {
376        Value::Mapping(m) => m.get("url").and_then(|u| u.as_str()).map(String::from),
377        _ => None,
378    }
379}
380
381/// Concatenate `before_script`, `script`, and `after_script` of a job into one
382/// string body (separated by newlines). Each section may be a single string or
383/// a list of strings. Empty sections are skipped.
384fn extract_script_body(job_map: &serde_yaml::Mapping) -> String {
385    let mut lines: Vec<String> = Vec::new();
386    for key in &["before_script", "script", "after_script"] {
387        if let Some(v) = job_map.get(*key) {
388            collect_script_lines(v, &mut lines);
389        }
390    }
391    lines.join("\n")
392}
393
394/// Append script lines from a YAML value (string or sequence of strings).
395fn collect_script_lines(v: &Value, out: &mut Vec<String>) {
396    match v {
397        Value::String(s) => out.push(s.clone()),
398        Value::Sequence(seq) => {
399            for item in seq {
400                if let Some(s) = item.as_str() {
401                    out.push(s.to_string());
402                }
403            }
404        }
405        _ => {}
406    }
407}
408
409/// Extract `artifacts.reports.dotenv` filename. Value may be a single string
410/// or a list of strings — for the list form we join with `,`.
411fn extract_dotenv_file(job_map: &serde_yaml::Mapping) -> Option<String> {
412    let dotenv = job_map
413        .get("artifacts")?
414        .as_mapping()?
415        .get("reports")?
416        .as_mapping()?
417        .get("dotenv")?;
418    match dotenv {
419        Value::String(s) => Some(s.clone()),
420        Value::Sequence(seq) => {
421            let parts: Vec<String> = seq
422                .iter()
423                .filter_map(|v| v.as_str().map(String::from))
424                .collect();
425            if parts.is_empty() {
426                None
427            } else {
428                Some(parts.join(","))
429            }
430        }
431        _ => None,
432    }
433}
434
435/// Extract upstream job names from `needs:` and `dependencies:`.
436/// `needs:` may be a list of strings or a list of mappings with `job:`.
437/// `dependencies:` is a list of strings.
438fn extract_needs(job_map: &serde_yaml::Mapping) -> Vec<String> {
439    let mut out: Vec<String> = Vec::new();
440    if let Some(needs) = job_map.get("needs").and_then(|v| v.as_sequence()) {
441        for item in needs {
442            match item {
443                Value::String(s) => out.push(s.clone()),
444                Value::Mapping(m) => {
445                    if let Some(j) = m.get("job").and_then(|j| j.as_str()) {
446                        out.push(j.to_string());
447                    }
448                }
449                _ => {}
450            }
451        }
452    }
453    if let Some(deps) = job_map.get("dependencies").and_then(|v| v.as_sequence()) {
454        for item in deps {
455            if let Some(s) = item.as_str() {
456                out.push(s.to_string());
457            }
458        }
459    }
460    out.sort();
461    out.dedup();
462    out
463}
464
465/// Classify a variable name as a credential by checking for common fragments.
466fn is_credential_name(name: &str) -> bool {
467    let upper = name.to_uppercase();
468    CRED_FRAGMENTS.iter().any(|frag| upper.contains(frag))
469}
470
471/// Parse `variables:` mapping and emit `Secret` nodes for credential-pattern names.
472/// Returns the list of created node IDs.
473fn process_variables(vars: Option<&Value>, graph: &mut AuthorityGraph, scope: &str) -> Vec<NodeId> {
474    let mut ids = Vec::new();
475    let map = match vars.and_then(|v| v.as_mapping()) {
476        Some(m) => m,
477        None => return ids,
478    };
479    for (k, _v) in map {
480        let name = match k.as_str() {
481            Some(s) => s,
482            None => continue,
483        };
484        if is_credential_name(name) {
485            let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
486            ids.push(id);
487            let _ = scope; // used for future scoped error messages
488        }
489    }
490    ids
491}
492
493/// Parse `secrets:` block and emit one `Secret` node per named secret.
494///
495/// GitLab CI `secrets:` format:
496/// ```yaml
497/// secrets:
498///   DATABASE_PASSWORD:
499///     vault: production/db/password@secret
500///   AWS_KEY:
501///     aws_secrets_manager:
502///       name: my-secret
503/// ```
504fn process_explicit_secrets(
505    secrets: Option<&Value>,
506    _scope: &str,
507    graph: &mut AuthorityGraph,
508) -> Vec<NodeId> {
509    let mut ids = Vec::new();
510    let map = match secrets.and_then(|v| v.as_mapping()) {
511        Some(m) => m,
512        None => return ids,
513    };
514    for (k, _v) in map {
515        let name = match k.as_str() {
516            Some(s) => s,
517            None => continue,
518        };
519        let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
520        ids.push(id);
521    }
522    ids
523}
524
525/// Parse `id_tokens:` block and emit one OIDC `Identity` node per token.
526///
527/// GitLab CI `id_tokens:` format:
528/// ```yaml
529/// id_tokens:
530///   SIGSTORE_ID_TOKEN:
531///     aud: sigstore
532///   AWS_OIDC_TOKEN:
533///     aud: https://sts.amazonaws.com
534/// ```
535fn process_id_tokens(
536    id_tokens: Option<&Value>,
537    _scope: &str,
538    graph: &mut AuthorityGraph,
539) -> Vec<NodeId> {
540    let mut ids = Vec::new();
541    let map = match id_tokens.and_then(|v| v.as_mapping()) {
542        Some(m) => m,
543        None => return ids,
544    };
545    for (k, v) in map {
546        let token_name = match k.as_str() {
547            Some(s) => s,
548            None => continue,
549        };
550        // Extract audience for labelling and as discrete metadata
551        // (rules like `id_token_audience_overscoped` need to compare audiences
552        // across jobs without re-parsing the label).
553        let aud = v
554            .as_mapping()
555            .and_then(|m| m.get("aud"))
556            .and_then(|a| a.as_str())
557            .unwrap_or("unknown");
558        let label = format!("{token_name} (aud={aud})");
559        let mut meta = HashMap::new();
560        meta.insert(META_OIDC.into(), "true".into());
561        meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
562        meta.insert(META_OIDC_AUDIENCE.into(), aud.to_string());
563        let id =
564            graph.add_node_with_metadata(NodeKind::Identity, label, TrustZone::FirstParty, meta);
565        ids.push(id);
566    }
567    ids
568}
569
570/// Parse `services:` block and emit `Image` nodes.
571fn process_services(services: Option<&Value>, graph: &mut AuthorityGraph) -> Vec<NodeId> {
572    let mut ids = Vec::new();
573    let list = match services.and_then(|v| v.as_sequence()) {
574        Some(s) => s,
575        None => return ids,
576    };
577    for item in list {
578        let img_str = match extract_image_str(item) {
579            Some(s) => s,
580            None => continue,
581        };
582        let pinned = is_docker_digest_pinned(&img_str);
583        let trust_zone = if pinned {
584            TrustZone::ThirdParty
585        } else {
586            TrustZone::Untrusted
587        };
588        let mut meta = HashMap::new();
589        if let Some(digest) = img_str.split("@sha256:").nth(1) {
590            meta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
591        }
592        let id = graph.add_node_with_metadata(NodeKind::Image, &img_str, trust_zone, meta);
593        ids.push(id);
594    }
595    ids
596}
597
598/// Check whether a job's `rules:` or `only:` indicates it runs on merge requests.
599fn job_has_mr_trigger(job_map: &serde_yaml::Mapping) -> bool {
600    // rules: [{if: '$CI_PIPELINE_SOURCE == "merge_request_event"'}]
601    if let Some(rules) = job_map.get("rules").and_then(|v| v.as_sequence()) {
602        for rule in rules {
603            if let Some(if_expr) = rule
604                .as_mapping()
605                .and_then(|m| m.get("if"))
606                .and_then(|v| v.as_str())
607            {
608                if if_expr.contains("merge_request_event") {
609                    return true;
610                }
611            }
612        }
613    }
614    // only: [merge_requests] or only: {refs: [merge_requests]}
615    if let Some(only) = job_map.get("only") {
616        if only_has_merge_requests(only) {
617            return true;
618        }
619    }
620    false
621}
622
623/// Check `only:` value (sequence or mapping) for `merge_requests` entry.
624fn only_has_merge_requests(v: &Value) -> bool {
625    match v {
626        Value::Sequence(seq) => seq
627            .iter()
628            .any(|item| item.as_str() == Some("merge_requests")),
629        Value::Mapping(m) => {
630            if let Some(refs) = m.get("refs").and_then(|r| r.as_sequence()) {
631                return refs
632                    .iter()
633                    .any(|item| item.as_str() == Some("merge_requests"));
634            }
635            false
636        }
637        _ => false,
638    }
639}
640
641/// Returns true when a job's `rules:` or `only:` clause restricts execution
642/// to protected refs only. The set of accepted patterns is intentionally
643/// generous because the goal is to *credit* defensive intent, not to
644/// audit-grade verify that every protection actually exists in GitLab's
645/// branch-protection settings — that lives outside the YAML.
646///
647/// Patterns recognised as a protected-only restriction:
648///
649///   * any `rules: [{ if: ... $CI_COMMIT_REF_PROTECTED ... }]`
650///   * any `rules: [{ if: ... $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH ... }]`
651///     (default branch is GitLab-protected by default)
652///   * any `rules: [{ if: ... $CI_COMMIT_TAG ... }]` (tags are protected by default)
653///   * `only: [main]` / `only: [master]` / `only: tags`
654///   * `only: { refs: [main, /^release/.*/] }`
655///
656/// Hits any one of the above → true. Misses every one → false.
657fn job_has_protected_branch_restriction(job_map: &serde_yaml::Mapping) -> bool {
658    if let Some(rules) = job_map.get("rules").and_then(|v| v.as_sequence()) {
659        for rule in rules {
660            let Some(if_expr) = rule
661                .as_mapping()
662                .and_then(|m| m.get("if"))
663                .and_then(|v| v.as_str())
664            else {
665                continue;
666            };
667            if if_expr.contains("$CI_COMMIT_REF_PROTECTED")
668                || if_expr.contains("CI_COMMIT_REF_PROTECTED")
669            {
670                return true;
671            }
672            if if_expr.contains("$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH")
673                || if_expr.contains("$CI_DEFAULT_BRANCH == $CI_COMMIT_BRANCH")
674            {
675                return true;
676            }
677            if if_expr.contains("$CI_COMMIT_TAG") {
678                return true;
679            }
680        }
681    }
682    if let Some(only) = job_map.get("only") {
683        if only_lists_protected_ref(only) {
684            return true;
685        }
686    }
687    false
688}
689
690/// Check `only:` for protected/default-branch refs (`main`, `master`, `tags`,
691/// or a `refs:` list containing those). Conservative — does NOT include
692/// `merge_requests` (that's the opposite signal).
693fn only_lists_protected_ref(v: &Value) -> bool {
694    fn is_protected_ref(s: &str) -> bool {
695        matches!(s, "main" | "master" | "tags") || s.starts_with("/^release")
696    }
697    match v {
698        Value::String(s) => is_protected_ref(s.as_str()),
699        Value::Sequence(seq) => seq
700            .iter()
701            .any(|item| item.as_str().map(is_protected_ref).unwrap_or(false)),
702        Value::Mapping(m) => {
703            if let Some(refs) = m.get("refs").and_then(|r| r.as_sequence()) {
704                return refs
705                    .iter()
706                    .any(|item| item.as_str().map(is_protected_ref).unwrap_or(false));
707            }
708            false
709        }
710        _ => false,
711    }
712}
713
714/// Check top-level `workflow:` rules for MR trigger.
715fn has_mr_trigger_in_workflow(wf: &Value) -> bool {
716    let rules = match wf
717        .as_mapping()
718        .and_then(|m| m.get("rules"))
719        .and_then(|r| r.as_sequence())
720    {
721        Some(r) => r,
722        None => return false,
723    };
724    for rule in rules {
725        if let Some(if_expr) = rule
726            .as_mapping()
727            .and_then(|m| m.get("if"))
728            .and_then(|v| v.as_str())
729        {
730            if if_expr.contains("merge_request_event") {
731                return true;
732            }
733        }
734    }
735    false
736}
737
/// Structured representation of a single `include:` entry.
///
/// The parser serialises the list of entries to JSON and stores it in
/// `AuthorityGraph::metadata[META_GITLAB_INCLUDES]` so that downstream rules
/// (e.g. `unpinned_include_remote_or_branch_ref`) can analyse remote-URL pins,
/// project refs, and missing `ref:` defaults without re-parsing the YAML.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct IncludeEntry {
    /// Include kind: `local`, `remote`, `template`, `project`, `component`, or
    /// `unknown` for shapes we don't recognise.
    pub kind: String,
    /// The path / URL / project string the include points at.
    pub target: String,
    /// The resolved `ref:` value. Empty string when the include omits a `ref:`
    /// (defaults to HEAD on the source repo, which is itself a finding).
    /// String-shorthand, `local:`, `remote:` and `template:` includes always
    /// carry an empty value here.
    pub git_ref: String,
}
755
756/// Parse the top-level `include:` value into a flat list of `IncludeEntry`s.
757///
758/// `include:` accepts five shapes — string, sequence-of-strings, sequence-of-mappings,
759/// sequence-of-strings-mixed-with-mappings, and a single mapping. Normalise all of
760/// them into one flat list so the rule layer doesn't have to.
761pub fn extract_include_entries(v: &Value) -> Vec<IncludeEntry> {
762    let mut out = Vec::new();
763    match v {
764        // `include: 'path/to/local.yml'` — sugar for a local include
765        Value::String(s) => {
766            out.push(IncludeEntry {
767                kind: classify_string_include(s).into(),
768                target: s.clone(),
769                git_ref: String::new(),
770            });
771        }
772        Value::Sequence(seq) => {
773            for item in seq {
774                match item {
775                    Value::String(s) => {
776                        out.push(IncludeEntry {
777                            kind: classify_string_include(s).into(),
778                            target: s.clone(),
779                            git_ref: String::new(),
780                        });
781                    }
782                    Value::Mapping(m) => {
783                        if let Some(e) = include_entry_from_mapping(m) {
784                            out.push(e);
785                        }
786                    }
787                    _ => {}
788                }
789            }
790        }
791        Value::Mapping(m) => {
792            if let Some(e) = include_entry_from_mapping(m) {
793                out.push(e);
794            }
795        }
796        _ => {}
797    }
798    out
799}
800
/// Classify a bare-string `include:` entry.
///
/// A string that begins with `http://` or `https://` (compared ASCII
/// case-insensitively) is shorthand for a `remote:` include; every other
/// string is treated as a `local:` path.
fn classify_string_include(s: &str) -> &'static str {
    // `str::get` yields `None` when the string is shorter than the scheme or
    // when the cut would split a multi-byte character; neither can be a match.
    let has_scheme = |scheme: &str| {
        s.get(..scheme.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(scheme))
    };
    if has_scheme("http://") || has_scheme("https://") {
        "remote"
    } else {
        "local"
    }
}
811
812/// Lift one of the four mapping forms (`local:`, `remote:`, `template:`,
813/// `project:`, `component:`) into an `IncludeEntry`. Returns None when the
814/// mapping has none of the recognised keys.
815fn include_entry_from_mapping(m: &serde_yaml::Mapping) -> Option<IncludeEntry> {
816    let str_at = |key: &str| {
817        m.get(key)
818            .and_then(|v| v.as_str())
819            .map(str::to_string)
820            .unwrap_or_default()
821    };
822    if let Some(s) = m.get("local").and_then(|v| v.as_str()) {
823        return Some(IncludeEntry {
824            kind: "local".into(),
825            target: s.to_string(),
826            git_ref: String::new(),
827        });
828    }
829    if let Some(s) = m.get("remote").and_then(|v| v.as_str()) {
830        return Some(IncludeEntry {
831            kind: "remote".into(),
832            target: s.to_string(),
833            git_ref: String::new(),
834        });
835    }
836    if let Some(s) = m.get("template").and_then(|v| v.as_str()) {
837        return Some(IncludeEntry {
838            kind: "template".into(),
839            target: s.to_string(),
840            git_ref: String::new(),
841        });
842    }
843    if let Some(s) = m.get("component").and_then(|v| v.as_str()) {
844        // GitLab CI/CD components: source@version → version is the pin
845        let (target, git_ref) = match s.rsplit_once('@') {
846            Some((path, ver)) => (path.to_string(), ver.to_string()),
847            None => (s.to_string(), String::new()),
848        };
849        return Some(IncludeEntry {
850            kind: "component".into(),
851            target,
852            git_ref,
853        });
854    }
855    if m.contains_key("project") {
856        let project = str_at("project");
857        // ref: may be missing → empty string indicates HEAD/default branch,
858        // which is itself a supply-chain finding.
859        let git_ref = str_at("ref");
860        return Some(IncludeEntry {
861            kind: "project".into(),
862            target: project,
863            git_ref,
864        });
865    }
866    None
867}
868
869/// Extract a flat list of template names from an `extends:` value.
870/// `extends:` accepts a single string or a sequence of strings.
871fn extract_extends_list(v: Option<&Value>) -> Vec<String> {
872    let v = match v {
873        Some(v) => v,
874        None => return Vec::new(),
875    };
876    match v {
877        Value::String(s) => vec![s.clone()],
878        Value::Sequence(seq) => seq
879            .iter()
880            .filter_map(|i| i.as_str().map(str::to_string))
881            .collect(),
882        _ => Vec::new(),
883    }
884}
885
886/// Returns true when any entry in `services:` has an image name matching
887/// `docker:*-dind` (or bare `docker:dind`). Recognises both shapes:
888/// `services: [docker:dind]` and `services: [{name: docker:dind}]`.
889fn job_services_have_dind(services: Option<&Value>) -> bool {
890    let list = match services.and_then(|v| v.as_sequence()) {
891        Some(s) => s,
892        None => return false,
893    };
894    for item in list {
895        let img = match extract_image_str(item) {
896            Some(s) => s,
897            None => continue,
898        };
899        if image_is_dind(&img) {
900            return true;
901        }
902    }
903    false
904}
905
/// Decide whether an image reference looks like a Docker-in-Docker image.
///
/// Matches `docker:dind`, `docker:24.0-dind`, `docker:24-dind`,
/// `docker:24.0.7-dind-rootless`, and their digest-pinned variants. The check
/// is ASCII case-insensitive: anything from the first `@` onwards (the digest)
/// is ignored, the remainder must start with `docker:` or `docker/`, and it
/// must contain `dind`.
fn image_is_dind(image: &str) -> bool {
    let normalized = image.to_ascii_lowercase();
    // The first `@`-separated piece is the name-and-tag; `split` always yields
    // at least one piece, so the fallback is never actually taken.
    let name_and_tag = normalized.split('@').next().unwrap_or(normalized.as_str());
    let is_docker_image = ["docker:", "docker/"]
        .iter()
        .any(|prefix| name_and_tag.starts_with(*prefix));
    is_docker_image && name_and_tag.contains("dind")
}
922
923/// Classify a `trigger:` block as either `static` (in-tree YAML / fixed
924/// downstream project) or `dynamic` (include from a previous job's artifact —
925/// dynamic child pipelines, the code-injection sink). Returns None when no
926/// `trigger:` block is present.
927fn classify_trigger(trigger: Option<&Value>) -> Option<&'static str> {
928    let t = trigger?;
929    // Shorthand: `trigger: my/downstream/project` → static
930    if t.is_string() {
931        return Some("static");
932    }
933    let m = t.as_mapping()?;
934    // Look at every `include:` entry under trigger; if ANY one references an
935    // `artifact:` field, the child pipeline is dynamic.
936    if let Some(inc) = m.get("include") {
937        if include_has_artifact_source(inc) {
938            return Some("dynamic");
939        }
940    }
941    Some("static")
942}
943
944/// Walk a `trigger.include:` value (string / sequence / mapping) and return
945/// true when any entry's mapping carries an `artifact:` key.
946fn include_has_artifact_source(v: &Value) -> bool {
947    match v {
948        Value::Mapping(m) => m.contains_key("artifact"),
949        Value::Sequence(seq) => seq.iter().any(|i| {
950            i.as_mapping()
951                .map(|m| m.contains_key("artifact"))
952                .unwrap_or(false)
953        }),
954        _ => false,
955    }
956}
957
958/// Extract `(cache.key, cache.policy)` from a job's `cache:` value. Returns
959/// `None` when no cache is declared. `cache:` may be a sequence of mappings
960/// (multiple caches); we capture the first key/policy pair so the rule layer
961/// has at least one signal — multi-cache analysis is left to a future
962/// extension.
963///
964/// `cache.key:` may be:
965/// - a string: `key: vendor`
966/// - a mapping: `key: { files: [Gemfile.lock] }` → captured as `files:Gemfile.lock,...`
967/// - a mapping with `prefix:` → captured as `prefix:<value>`
968fn extract_cache_key_policy(v: Option<&Value>) -> Option<(String, Option<String>)> {
969    let v = v?;
970    let m = match v {
971        Value::Mapping(m) => m,
972        Value::Sequence(seq) => {
973            // First cache wins — same heuristic used elsewhere.
974            return seq
975                .iter()
976                .find_map(|i| i.as_mapping().and_then(extract_cache_key_policy_map));
977        }
978        _ => return None,
979    };
980    extract_cache_key_policy_map(m)
981}
982
983fn extract_cache_key_policy_map(m: &serde_yaml::Mapping) -> Option<(String, Option<String>)> {
984    let key = match m.get("key") {
985        Some(Value::String(s)) => s.clone(),
986        Some(Value::Number(n)) => n.to_string(),
987        Some(Value::Bool(b)) => b.to_string(),
988        Some(Value::Mapping(km)) => {
989            let mut parts = Vec::new();
990            if let Some(prefix) = km.get("prefix").and_then(|v| v.as_str()) {
991                parts.push(format!("prefix:{prefix}"));
992            }
993            if let Some(files) = km.get("files").and_then(|v| v.as_sequence()) {
994                let names: Vec<String> = files
995                    .iter()
996                    .filter_map(|f| f.as_str().map(str::to_string))
997                    .collect();
998                if !names.is_empty() {
999                    parts.push(format!("files:{}", names.join(",")));
1000                }
1001            }
1002            if parts.is_empty() {
1003                String::new()
1004            } else {
1005                parts.join(";")
1006            }
1007        }
1008        _ => String::new(),
1009    };
1010    let policy = m.get("policy").and_then(|v| v.as_str()).map(str::to_string);
1011    Some((key, policy))
1012}
1013
/// Unit tests for `GitlabParser`: each test parses an inline `.gitlab-ci.yml`
/// snippet and inspects the resulting `AuthorityGraph`.
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `yaml` as `.gitlab-ci.yml` with no repository context; panics if
    /// the parser returns an error.
    fn parse(yaml: &str) -> AuthorityGraph {
        let parser = GitlabParser;
        let source = PipelineSource {
            file: ".gitlab-ci.yml".into(),
            repo: None,
            git_ref: None,
            commit_sha: None,
        };
        parser.parse(yaml, &source).unwrap()
    }

    #[test]
    fn ci_job_token_always_present() {
        let yaml = r#"
stages:
  - build

build-job:
  stage: build
  script:
    - make build
"#;
        let graph = parse(yaml);
        let identities: Vec<_> = graph.nodes_of_kind(NodeKind::Identity).collect();
        assert_eq!(identities.len(), 1);
        assert_eq!(identities[0].name, "CI_JOB_TOKEN");
        assert_eq!(
            identities[0]
                .metadata
                .get(META_IMPLICIT)
                .map(String::as_str),
            Some("true")
        );
        assert_eq!(
            identities[0]
                .metadata
                .get(META_IDENTITY_SCOPE)
                .map(String::as_str),
            Some("broad")
        );
    }

    #[test]
    fn global_credential_variable_emits_secret_node() {
        let yaml = r#"
variables:
  APP_VERSION: "1.0"
  DEPLOY_TOKEN: "$CI_DEPLOY_TOKEN"

build-job:
  script:
    - make
"#;
        let graph = parse(yaml);
        let secrets: Vec<_> = graph.nodes_of_kind(NodeKind::Secret).collect();
        assert!(
            secrets.iter().any(|s| s.name == "DEPLOY_TOKEN"),
            "DEPLOY_TOKEN must emit a Secret node, got: {:?}",
            secrets.iter().map(|s| &s.name).collect::<Vec<_>>()
        );
        // Plain config variable must not emit Secret
        assert!(
            !secrets.iter().any(|s| s.name == "APP_VERSION"),
            "APP_VERSION must not emit a Secret node"
        );
    }

    #[test]
    fn floating_image_emits_untrusted_image_node() {
        let yaml = r#"
deploy:
  image: alpine:latest
  script:
    - deploy.sh
"#;
        let graph = parse(yaml);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].name, "alpine:latest");
        assert_eq!(images[0].trust_zone, TrustZone::Untrusted);
    }

    #[test]
    fn digest_pinned_image_is_third_party() {
        let yaml = r#"
deploy:
  image: "alpine@sha256:a5ac7e51b41094c92402da3b24376905380afc29a5ac7e51b41094c92402da3b"
  script:
    - deploy.sh
"#;
        let graph = parse(yaml);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].trust_zone, TrustZone::ThirdParty);
    }

    #[test]
    fn id_tokens_emit_oidc_identity_nodes() {
        let yaml = r#"
deploy:
  id_tokens:
    SIGSTORE_ID_TOKEN:
      aud: sigstore
    AWS_OIDC_TOKEN:
      aud: https://sts.amazonaws.com
  script:
    - deploy.sh
"#;
        let graph = parse(yaml);
        let oidc: Vec<_> = graph
            .nodes_of_kind(NodeKind::Identity)
            .filter(|n| n.metadata.get(META_OIDC).map(String::as_str) == Some("true"))
            .collect();
        assert_eq!(
            oidc.len(),
            2,
            "expected 2 OIDC identity nodes, got: {:?}",
            oidc.iter().map(|n| &n.name).collect::<Vec<_>>()
        );
    }

    #[test]
    fn explicit_secrets_emit_secret_nodes() {
        let yaml = r#"
deploy:
  secrets:
    DATABASE_PASSWORD:
      vault: production/db/password@secret
    AWS_KEY:
      aws_secrets_manager:
        name: my-secret
  script:
    - deploy.sh
"#;
        let graph = parse(yaml);
        let secrets: Vec<_> = graph.nodes_of_kind(NodeKind::Secret).collect();
        let names: Vec<_> = secrets.iter().map(|s| s.name.as_str()).collect();
        assert!(names.contains(&"DATABASE_PASSWORD"), "got: {names:?}");
        assert!(names.contains(&"AWS_KEY"), "got: {names:?}");
    }

    #[test]
    fn rules_mr_trigger_sets_meta_trigger() {
        let yaml = r#"
test:
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
  script:
    - run tests
"#;
        let graph = parse(yaml);
        assert_eq!(
            graph.metadata.get(META_TRIGGER).map(String::as_str),
            Some("merge_request"),
            "META_TRIGGER must be set to merge_request"
        );
    }

    #[test]
    fn only_merge_requests_sets_meta_trigger() {
        let yaml = r#"
test:
  only:
    - merge_requests
  script:
    - run tests
"#;
        let graph = parse(yaml);
        assert_eq!(
            graph.metadata.get(META_TRIGGER).map(String::as_str),
            Some("merge_request")
        );
    }

    #[test]
    fn include_marks_graph_partial() {
        let yaml = r#"
include:
  - local: '/templates/.base.yml'

build:
  script:
    - make
"#;
        let graph = parse(yaml);
        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
    }

    #[test]
    fn extends_marks_graph_partial() {
        let yaml = r#"
.base:
  script:
    - echo base

my-job:
  extends: .base
  stage: build
"#;
        let graph = parse(yaml);
        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
    }

    #[test]
    fn meta_job_name_set_on_step_nodes() {
        let yaml = r#"
build:
  script:
    - make
deploy:
  script:
    - deploy.sh
"#;
        let graph = parse(yaml);
        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
        assert_eq!(steps.len(), 2);
        for step in &steps {
            assert!(
                step.metadata.contains_key(META_JOB_NAME),
                "Step '{}' missing META_JOB_NAME",
                step.name
            );
        }
        // Verify job names are correct
        let names: Vec<_> = steps
            .iter()
            .map(|s| s.metadata.get(META_JOB_NAME).unwrap().as_str())
            .collect();
        assert!(names.contains(&"build"), "got: {names:?}");
        assert!(names.contains(&"deploy"), "got: {names:?}");
    }

    #[test]
    fn reserved_keywords_not_parsed_as_jobs() {
        let yaml = r#"
stages:
  - build
  - test

variables:
  MY_VAR: value

image: alpine:latest

build:
  stage: build
  script:
    - make
"#;
        let graph = parse(yaml);
        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
        assert_eq!(
            steps.len(),
            1,
            "only 'build' should be a Step, got: {:?}",
            steps.iter().map(|s| &s.name).collect::<Vec<_>>()
        );
        assert_eq!(steps[0].name, "build");
    }

    #[test]
    fn services_emit_image_nodes() {
        let yaml = r#"
test:
  services:
    - docker:dind
    - name: postgres:14
  script:
    - run_tests
"#;
        let graph = parse(yaml);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(
            images.len(),
            2,
            "expected 2 service Image nodes, got: {:?}",
            images.iter().map(|i| &i.name).collect::<Vec<_>>()
        );
    }

    // ── Cross-platform misclassification trap (red-team R2 #5) ─────

    #[test]
    fn non_mapping_job_values_do_not_mark_partial() {
        // Top-level keys that look like job names but whose values are lists
        // rather than mappings. Every candidate is skipped, so no Step nodes
        // are produced.
        //
        // The "had_job_carrier" heuristic only fires when the value IS a
        // mapping, so this shape intentionally does NOT mark the graph
        // Partial. The heuristic targets the trap where an attacker uses a
        // *valid mapping shape* the GitLab parser can't interpret.
        let yaml = r#"
build:
  - this is a list, not a mapping
test:
  - also a list
"#;
        let graph = parse(yaml);
        let step_count = graph
            .nodes
            .iter()
            .filter(|n| n.kind == NodeKind::Step)
            .count();
        assert_eq!(step_count, 0);
        assert_eq!(
            graph.completeness,
            AuthorityCompleteness::Complete,
            "non-mapping values are not job carriers"
        );
    }

    #[test]
    fn mapping_jobs_without_recognisable_step_content_marks_partial() {
        // The GitLab parser adds a Step node for any mapping-valued,
        // non-reserved top-level key, so the only way to reach the
        // "0 steps + job carrier present" shape is a file whose sole
        // candidate is a hidden/template job (name starts with '.').
        let yaml = r#"
.template-only:
  script:
    - echo "this is a template-only file"
"#;
        let graph = parse(yaml);
        let step_count = graph
            .nodes
            .iter()
            .filter(|n| n.kind == NodeKind::Step)
            .count();
        assert_eq!(step_count, 0);
        // Hidden jobs already mark partial with their own reason.
        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
    }
}