Skip to main content

taudit_parse_gitlab/
lib.rs

1use std::collections::{HashMap, HashSet};
2
3use serde::{Deserialize, Serialize};
4use serde_yaml::Value;
5use taudit_core::error::TauditError;
6use taudit_core::graph::*;
7// Re-import explicitly to make the new constants visible at a glance.
8#[allow(unused_imports)]
9use taudit_core::graph::{META_DOTENV_FILE, META_ENVIRONMENT_NAME, META_NEEDS, META_SCRIPT_BODY};
10use taudit_core::ports::PipelineParser;
11
/// GitLab CI YAML parser.
///
/// Parses `.gitlab-ci.yml` files into an `AuthorityGraph`. The authority model:
/// - Each job (a top-level key that is neither reserved nor dot-prefixed) is a
///   `Step` node.
/// - `CI_JOB_TOKEN` is a global implicit `Identity` (always present, scope=broad)
///   that every job is linked to with a `HasAccessTo` edge.
/// - `secrets:` entries emit `Secret` nodes with `HasAccessTo` edges.
/// - `id_tokens:` entries emit OIDC `Identity` nodes.
/// - `variables:` entries with credential-pattern names emit `Secret` nodes.
/// - `image:` and `services:` emit `Image` nodes with `UsesImage` edges.
/// - `include:`, `extends:` and hidden (dot-prefixed) template jobs mark the
///   graph `Partial`.
/// - `rules: if: merge_request_event` and `only: merge_requests` set `META_TRIGGER`.
pub struct GitlabParser;
24
/// Reserved top-level keys that are not job definitions.
///
/// `build_graph_from_root` skips every top-level entry whose key appears here,
/// so these keys never become `Step` nodes.
const RESERVED: &[&str] = &[
    "stages",
    "workflow",
    "include",
    "variables",
    "image",
    "services",
    "default",
    "cache",
    "before_script",
    "after_script",
    "types",
];
39
/// Variable name fragments that indicate a credential rather than plain config.
///
/// Every fragment is written in upper case.
/// NOTE(review): the code that consumes this list is outside this excerpt —
/// presumably a substring test against variable names; confirm whether names
/// are upper-cased before the comparison.
const CRED_FRAGMENTS: &[&str] = &[
    "TOKEN",
    "SECRET",
    "PASSWORD",
    "PASSWD",
    "PRIVATE_KEY",
    "API_KEY",
    "APIKEY",
    "SIGNING_KEY",
    "ACCESS_KEY",
    "SERVICE_ACCOUNT",
    "CERT",
    "CREDENTIAL",
];
55
56impl PipelineParser for GitlabParser {
57    fn platform(&self) -> &str {
58        "gitlab-ci"
59    }
60
61    fn parse(&self, content: &str, source: &PipelineSource) -> Result<AuthorityGraph, TauditError> {
62        let (parse_content, duplicate_recovery_note) = match parse_gitlab_yaml_value(content) {
63            Ok((root, extra_docs, first_doc_was_spec_header)) => {
64                let mut graph = build_graph_from_root(root, source)?;
65                if extra_docs {
66                    graph.mark_partial(
67                        GapKind::Expression,
68                        if first_doc_was_spec_header {
69                            "file contains GitLab spec: header plus executable config document — analyzed the executable document and preserved spec: as an unresolved header".to_string()
70                        } else {
71                            "file contains multiple YAML documents (--- separator) — only the first was analyzed".to_string()
72                        },
73                    );
74                }
75                return Ok(graph);
76            }
77            Err(e) if is_duplicate_key_parse_error(&e) => {
78                let sanitized = sanitize_duplicate_mapping_keys(content);
79                let note = format!(
80                    "GitLab YAML contained duplicate mapping keys; later duplicates were preserved as opaque __taudit_duplicate_* keys during recovery ({e})"
81                );
82                (sanitized, Some(note))
83            }
84            Err(e) => return Err(TauditError::Parse(format!("YAML parse error: {e}"))),
85        };
86
87        let (root, extra_docs, first_doc_was_spec_header) = parse_gitlab_yaml_value(&parse_content)
88            .map_err(|e| TauditError::Parse(format!("YAML parse error: {e}")))?;
89        let mut graph = build_graph_from_root(root, source)?;
90        if extra_docs {
91            graph.mark_partial(
92                GapKind::Expression,
93                if first_doc_was_spec_header {
94                    "file contains GitLab spec: header plus executable config document — analyzed the executable document and preserved spec: as an unresolved header".to_string()
95                } else {
96                    "file contains multiple YAML documents (--- separator) — only the first was analyzed".to_string()
97                },
98            );
99        }
100        if let Some(note) = duplicate_recovery_note {
101            graph.mark_partial(GapKind::Structural, note);
102        }
103        Ok(graph)
104    }
105}
106
107fn parse_gitlab_yaml_value(content: &str) -> Result<(Value, bool, bool), serde_yaml::Error> {
108    let mut de = serde_yaml::Deserializer::from_str(content);
109    let Some(doc) = de.next() else {
110        return Ok((Value::Null, false, false));
111    };
112    let first = Value::deserialize(doc)?;
113    let Some(second_doc) = de.next() else {
114        return Ok((first, false, false));
115    };
116    if gitlab_doc_is_spec_header(&first) {
117        return Ok((Value::deserialize(second_doc)?, true, true));
118    }
119    Ok((first, true, false))
120}
121
122fn gitlab_doc_is_spec_header(doc: &Value) -> bool {
123    let Some(map) = doc.as_mapping() else {
124        return false;
125    };
126    map.contains_key("spec")
127}
128
129fn is_duplicate_key_parse_error(error: &serde_yaml::Error) -> bool {
130    error.to_string().contains("duplicate entry with key")
131}
132
133fn sanitize_duplicate_mapping_keys(content: &str) -> String {
134    #[derive(Default)]
135    struct Frame {
136        indent: usize,
137        keys: HashSet<String>,
138    }
139
140    let mut out = Vec::new();
141    let mut frames: Vec<Frame> = Vec::new();
142    let mut duplicate_counts: HashMap<(usize, String), usize> = HashMap::new();
143    let mut block_scalar_indent: Option<usize> = None;
144
145    for line in content.lines() {
146        let indent = line.chars().take_while(|c| *c == ' ').count();
147        let trimmed = &line[indent..];
148
149        if let Some(block_indent) = block_scalar_indent {
150            if !trimmed.is_empty() && indent <= block_indent {
151                block_scalar_indent = None;
152            } else {
153                out.push(line.to_string());
154                continue;
155            }
156        }
157
158        if trimmed.is_empty() || trimmed.starts_with('#') {
159            out.push(line.to_string());
160            continue;
161        }
162
163        let (key_indent, key_start, key_end, key) = match yaml_mapping_key_span(line, indent) {
164            Some(parts) => parts,
165            None => {
166                out.push(line.to_string());
167                continue;
168            }
169        };
170
171        while frames.last().is_some_and(|frame| frame.indent > key_indent) {
172            frames.pop();
173        }
174        if !frames.iter().any(|frame| frame.indent == key_indent) {
175            frames.push(Frame {
176                indent: key_indent,
177                keys: HashSet::new(),
178            });
179        }
180        let frame = frames
181            .iter_mut()
182            .rev()
183            .find(|frame| frame.indent == key_indent)
184            .expect("frame inserted above");
185
186        if frame.keys.insert(key.clone()) {
187            out.push(line.to_string());
188        } else {
189            let count = duplicate_counts
190                .entry((key_indent, key.clone()))
191                .and_modify(|n| *n += 1)
192                .or_insert(2);
193            let replacement = format!(
194                "__taudit_duplicate_{}_{}",
195                sanitize_key_fragment(&key),
196                count
197            );
198            let mut rewritten = String::with_capacity(line.len() + replacement.len());
199            rewritten.push_str(&line[..key_start]);
200            rewritten.push_str(&replacement);
201            rewritten.push_str(&line[key_end..]);
202            out.push(rewritten);
203        }
204
205        let value_tail = line[key_end..].trim_start();
206        if value_tail.starts_with(": |") || value_tail.starts_with(": >") {
207            block_scalar_indent = Some(key_indent);
208        }
209    }
210
211    let mut sanitized = out.join("\n");
212    if content.ends_with('\n') {
213        sanitized.push('\n');
214    }
215    sanitized
216}
217
/// Locate the mapping key on a single YAML line, if the line has one.
///
/// `indent` is the number of leading spaces of `line`. A leading `- `
/// (sequence item) is skipped, in which case the key column is `indent + 2`.
///
/// Returns `(key_indent, key_start, key_end, key)` where `key_indent` is the
/// column the key belongs to, `key_start..key_end` is the byte range of the
/// key text inside `line` (surrounding whitespace excluded, quotes included)
/// and `key` is that text.
///
/// Returns `None` when:
/// * `indent` is not a valid offset into `line`;
/// * the line is a comment;
/// * a ` #` comment starts before any key separator was found, so a
///   `word: ` inside the comment is not mistaken for a key;
/// * no `:` followed by whitespace or end of line exists outside quotes and
///   flow brackets;
/// * the text before the separator is empty.
fn yaml_mapping_key_span(line: &str, indent: usize) -> Option<(usize, usize, usize, String)> {
    // `get` instead of slicing: an out-of-range indent yields `None`, not a panic.
    let trimmed = line.get(indent..)?;
    if trimmed.starts_with('#') {
        return None;
    }

    let (key_indent, key_text) = match trimmed.strip_prefix("- ") {
        Some(rest) => (indent + 2, rest),
        None => (indent, trimmed),
    };
    // Byte offset in `line` at which `key_text` begins.
    let key_start = key_indent;

    let mut in_single = false;
    let mut in_double = false;
    let mut bracket_depth = 0i32;
    let mut prev = '\0';
    for (offset, ch) in key_text.char_indices() {
        match ch {
            '\'' if !in_double => in_single = !in_single,
            '"' if !in_single && prev != '\\' => in_double = !in_double,
            // An unquoted `#` at the start or after whitespace begins a
            // comment; nothing after it can be a key separator.
            '#' if !in_single && !in_double && (offset == 0 || prev.is_whitespace()) => {
                return None;
            }
            '[' | '{' if !in_single && !in_double => bracket_depth += 1,
            ']' | '}' if !in_single && !in_double => bracket_depth -= 1,
            ':' if !in_single && !in_double && bracket_depth == 0 => {
                // Only `:` followed by whitespace / end of line separates a
                // key from its value (`http://x` is not a separator).
                let ends_key = match key_text[offset + ch.len_utf8()..].chars().next() {
                    None => true,
                    Some(next) => next.is_whitespace(),
                };
                if ends_key {
                    let raw = &key_text[..offset];
                    let key = raw.trim();
                    if key.is_empty() {
                        return None;
                    }
                    let leading = raw.len() - raw.trim_start().len();
                    let trailing = raw.trim_end().len();
                    return Some((
                        key_indent,
                        key_start + leading,
                        key_start + trailing,
                        key.to_string(),
                    ));
                }
            }
            _ => {}
        }
        prev = ch;
    }
    None
}
267
/// Turn an arbitrary key into a short identifier-safe fragment.
///
/// ASCII letters and digits are kept (lower-cased); every run of other
/// characters collapses into a single `_`. Leading and trailing underscores
/// are removed and the result is cut to at most 48 characters.
fn sanitize_key_fragment(key: &str) -> String {
    let mut collapsed = String::with_capacity(key.len());
    for ch in key.chars() {
        if ch.is_ascii_alphanumeric() {
            collapsed.push(ch.to_ascii_lowercase());
        } else if !collapsed.ends_with('_') {
            // First character of a run of separators; the rest are dropped.
            collapsed.push('_');
        }
    }
    collapsed.trim_matches('_').chars().take(48).collect()
}
282
283fn build_graph_from_root(
284    root: Value,
285    source: &PipelineSource,
286) -> Result<AuthorityGraph, TauditError> {
287    let mapping = root
288        .as_mapping()
289        .ok_or_else(|| TauditError::Parse("GitLab CI root must be a mapping".into()))?;
290
291    let mut graph = AuthorityGraph::new(source.clone());
292    graph.metadata.insert(META_PLATFORM.into(), "gitlab".into());
293
294    // CI_JOB_TOKEN is always present in every GitLab CI job — it's the built-in
295    // platform token, equivalent to ADO's System.AccessToken or GHA's GITHUB_TOKEN.
296    let mut meta = HashMap::new();
297    meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
298    meta.insert(META_IMPLICIT.into(), "true".into());
299    let token_id = graph.add_node_with_metadata(
300        NodeKind::Identity,
301        "CI_JOB_TOKEN",
302        TrustZone::FirstParty,
303        meta,
304    );
305
306    // Top-level include: → mark Partial immediately AND capture each
307    // entry's structure as graph metadata so include-pinning rules can
308    // reason about remote URLs and unpinned project refs.
309    if let Some(inc) = mapping.get("include") {
310        graph.mark_partial(
311            GapKind::Structural,
312            "include: directive present — included templates not resolved".to_string(),
313        );
314        let entries = extract_include_entries(inc);
315        if !entries.is_empty() {
316            if let Ok(json) = serde_json::to_string(&entries) {
317                graph.metadata.insert(META_GITLAB_INCLUDES.into(), json);
318            }
319        }
320    }
321
322    // Global variables
323    let global_secrets = process_variables(mapping.get("variables"), &mut graph, "pipeline");
324
325    // Global image
326    let global_image = mapping.get("image").and_then(extract_image_str);
327
328    // Top-level merge_request trigger detection from `workflow:` rules
329    if let Some(wf) = mapping.get("workflow") {
330        if has_mr_trigger_in_workflow(wf) {
331            graph
332                .metadata
333                .insert(META_TRIGGER.into(), "merge_request".into());
334        }
335    }
336
337    // Process each job (any top-level key not in RESERVED)
338    // determinism: sort by key — same YAML must produce same NodeId order
339    let mut top_level_entries: Vec<(&Value, &Value)> = mapping.iter().collect();
340    top_level_entries.sort_by(|a, b| a.0.as_str().unwrap_or("").cmp(b.0.as_str().unwrap_or("")));
341    for (key, value) in top_level_entries {
342        let job_name = match key.as_str() {
343            Some(k) => k,
344            None => continue,
345        };
346        if RESERVED.contains(&job_name) {
347            continue;
348        }
349
350        // Hidden jobs (starting with a dot) are templates — mark Partial, skip
351        if job_name.starts_with('.') {
352            graph.mark_partial(
353                GapKind::Structural,
354                format!("job '{job_name}' is a hidden/template job — not resolved"),
355            );
356            continue;
357        }
358
359        let job_map = match value.as_mapping() {
360            Some(m) => m,
361            None => continue,
362        };
363
364        // extends: — job template inheritance, can't resolve statically
365        let extends_names = extract_extends_list(job_map.get("extends"));
366        if !extends_names.is_empty() {
367            graph.mark_partial(
368                GapKind::Structural,
369                format!("job '{job_name}' uses extends: — inherited configuration not resolved"),
370            );
371        }
372
373        // Detect PR/MR trigger in this job's rules: or only:
374        let job_triggers_mr = job_has_mr_trigger(job_map);
375
376        // Propagate job MR trigger to graph level
377        if job_triggers_mr && !graph.metadata.contains_key(META_TRIGGER) {
378            graph
379                .metadata
380                .insert(META_TRIGGER.into(), "merge_request".into());
381        }
382
383        // Job-level variables
384        let job_secrets = process_variables(job_map.get("variables"), &mut graph, job_name);
385
386        // Job-level explicit secrets: (Vault, AWS Secrets Manager, GCP, Azure)
387        let explicit_secrets =
388            process_explicit_secrets(job_map.get("secrets"), job_name, &mut graph);
389
390        // Job-level OIDC tokens (id_tokens:)
391        let oidc_identities = process_id_tokens(job_map.get("id_tokens"), job_name, &mut graph);
392
393        // Job image (falls back to global)
394        let job_image_str = job_map
395            .get("image")
396            .and_then(extract_image_str)
397            .or(global_image.as_deref().map(String::from));
398
399        let image_id = job_image_str.as_deref().map(|img| {
400            let pinned = is_docker_digest_pinned(img);
401            let trust_zone = if pinned {
402                TrustZone::ThirdParty
403            } else {
404                TrustZone::Untrusted
405            };
406            let mut imeta = HashMap::new();
407            if let Some(digest) = img.split("@sha256:").nth(1) {
408                imeta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
409            }
410            graph.add_node_with_metadata(NodeKind::Image, img, trust_zone, imeta)
411        });
412
413        // Services (each is an Image node)
414        let service_ids = process_services(job_map.get("services"), &mut graph);
415
416        // Environment — record name as metadata, sets trust boundary marker
417        let env_name = job_map
418            .get("environment")
419            .and_then(extract_environment_name);
420        let env_url = job_map.get("environment").and_then(extract_environment_url);
421
422        // Concatenated script body (before_script + script + after_script).
423        // Stamped on the Step node so script-aware rules (notably
424        // `untrusted_ci_var_in_shell_interpolation` and
425        // `ci_job_token_to_external_api`) can pattern-match without
426        // re-walking the YAML.
427        // Inline script body — concatenate before_script, script, after_script
428        // (each may be a string or a list-of-strings). Stamped on the Step so
429        // script-aware rules can pattern-match without re-parsing YAML.
430        let script_body = extract_script_body(job_map);
431
432        // GitLab `artifacts.reports.dotenv: <file>` — when set, the file's
433        // KEY=value lines are silently promoted to pipeline variables for
434        // any downstream job that consumes this one via `needs:` /
435        // `dependencies:`. Required input to
436        // `dotenv_artifact_flows_to_privileged_deployment`.
437        let dotenv_file = extract_dotenv_file(job_map);
438
439        // Upstream job names consumed via `needs:` / `dependencies:`.
440        // Used to build dotenv-flow chains across stages.
441        let needs = extract_needs(job_map);
442
443        // Detect whether this job's `rules:` / `only:` clause restricts
444        // execution to protected branches (or to the default branch,
445        // which is protected by GitLab default policy). Used by the
446        // `gitlab_deploy_job_missing_protected_branch_only` rule to
447        // detect deployment jobs that lack any branch guard.
448        let protected_only = job_has_protected_branch_restriction(job_map);
449
450        // Create the Step node for this job
451        let mut step_meta = HashMap::new();
452        step_meta.insert(META_JOB_NAME.into(), job_name.to_string());
453        if let Some(ref env) = env_name {
454            step_meta.insert(META_ENVIRONMENT_NAME.into(), env.clone());
455        }
456        if !script_body.is_empty() {
457            step_meta.insert(META_SCRIPT_BODY.into(), script_body);
458        }
459        if let Some(ref f) = dotenv_file {
460            step_meta.insert(META_DOTENV_FILE.into(), f.clone());
461        }
462        if !needs.is_empty() {
463            step_meta.insert(META_NEEDS.into(), needs.join(","));
464        }
465        if let Some(ref url) = env_url {
466            step_meta.insert(META_ENVIRONMENT_URL.into(), url.clone());
467        }
468        // Per-step MR trigger marker — graph-level META_TRIGGER applies to
469        // the file as a whole, but `id_token_audience_overscoped` needs to
470        // compare audience usage between MR-context and protected-context
471        // jobs in the same file.
472        if job_triggers_mr {
473            step_meta.insert(META_TRIGGER.into(), "merge_request".into());
474        }
475        // extends: list (comma-joined, in source order)
476        if !extends_names.is_empty() {
477            step_meta.insert(META_GITLAB_EXTENDS.into(), extends_names.join(","));
478        }
479        // allow_failure: true|false (only stamp when explicitly set so the
480        // rule can distinguish "absent" from "false")
481        if let Some(af) = job_map.get("allow_failure").and_then(|v| v.as_bool()) {
482            step_meta.insert(META_GITLAB_ALLOW_FAILURE.into(), af.to_string());
483        } else if job_map
484            .get("allow_failure")
485            .and_then(|v| v.as_mapping())
486            .is_some()
487        {
488            // `allow_failure: { exit_codes: [42] }` — conditional pass; treat
489            // as truthy for silent-skip detection.
490            step_meta.insert(META_GITLAB_ALLOW_FAILURE.into(), "true".into());
491        }
492        // dind sidecar detection: any service whose name matches docker:*-dind
493        if job_services_have_dind(job_map.get("services")) {
494            step_meta.insert(META_GITLAB_DIND_SERVICE.into(), "true".into());
495        }
496        // trigger: block — child / downstream pipeline
497        if let Some(kind) = classify_trigger(job_map.get("trigger")) {
498            step_meta.insert(META_GITLAB_TRIGGER_KIND.into(), kind.into());
499        }
500        // cache: structural capture (key + policy)
501        if let Some((cache_key, cache_policy)) = extract_cache_key_policy(job_map.get("cache")) {
502            step_meta.insert(META_GITLAB_CACHE_KEY.into(), cache_key);
503            if let Some(p) = cache_policy {
504                step_meta.insert(META_GITLAB_CACHE_POLICY.into(), p);
505            }
506        }
507        if protected_only {
508            step_meta.insert(META_RULES_PROTECTED_ONLY.into(), "true".into());
509        }
510        let step_id = graph.add_node_with_metadata(
511            NodeKind::Step,
512            job_name,
513            TrustZone::FirstParty,
514            step_meta,
515        );
516
517        // CI_JOB_TOKEN always available to every step
518        graph.add_edge(step_id, token_id, EdgeKind::HasAccessTo);
519
520        // Link all secrets
521        for &sid in global_secrets
522            .iter()
523            .chain(&job_secrets)
524            .chain(&explicit_secrets)
525        {
526            graph.add_edge(step_id, sid, EdgeKind::HasAccessTo);
527        }
528
529        // Link OIDC identities
530        for &iid in &oidc_identities {
531            graph.add_edge(step_id, iid, EdgeKind::HasAccessTo);
532        }
533
534        // UsesImage edges
535        if let Some(img_id) = image_id {
536            graph.add_edge(step_id, img_id, EdgeKind::UsesImage);
537        }
538        for &svc_id in &service_ids {
539            graph.add_edge(step_id, svc_id, EdgeKind::UsesImage);
540        }
541    }
542
543    // Cross-platform misclassification trap (red-team R2 #5): a YAML file
544    // with non-reserved top-level keys looks like a GitLab pipeline shape
545    // but its body may use constructs the GitLab parser doesn't recognise
546    // (e.g. an ADO `task:` payload). Mark Partial when the source had at
547    // least one job-shaped top-level key but we ended up with no Step
548    // nodes — better than silently returning completeness=complete on a
549    // clean-but-empty graph that a CI gate would treat as "passed".
550    let step_count = graph
551        .nodes
552        .iter()
553        .filter(|n| n.kind == NodeKind::Step)
554        .count();
555    let had_job_carrier = mapping.iter().any(|(k, v)| {
556        k.as_str()
557            .map(|name| !RESERVED.contains(&name) && !name.starts_with('.'))
558            .unwrap_or(false)
559            && v.as_mapping().is_some()
560    });
561    if step_count == 0 && had_job_carrier {
562        graph.mark_partial(
563                GapKind::Opaque,
564                "non-reserved top-level keys parsed but produced 0 step nodes — possible non-GitLab YAML wrong-platform-classified".to_string(),
565            );
566    }
567
568    graph.stamp_edge_authority_summaries();
569    Ok(graph)
570}
571/// Detect `image:` string from a YAML value — can be a bare string or a mapping with `name:`.
572fn extract_image_str(v: &Value) -> Option<String> {
573    match v {
574        Value::String(s) => Some(s.clone()),
575        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
576        _ => None,
577    }
578}
579
580/// Extract environment name from `environment:` value (string or mapping).
581fn extract_environment_name(v: &Value) -> Option<String> {
582    match v {
583        Value::String(s) => Some(s.clone()),
584        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
585        _ => None,
586    }
587}
588
589/// Extract `environment:url:` value (only present when environment is a mapping).
590fn extract_environment_url(v: &Value) -> Option<String> {
591    match v {
592        Value::Mapping(m) => m.get("url").and_then(|u| u.as_str()).map(String::from),
593        _ => None,
594    }
595}
596
597/// Concatenate `before_script`, `script`, and `after_script` of a job into one
598/// string body (separated by newlines). Each section may be a single string or
599/// a list of strings. Empty sections are skipped.
600fn extract_script_body(job_map: &serde_yaml::Mapping) -> String {
601    let mut lines: Vec<String> = Vec::new();
602    for key in &["before_script", "script", "after_script"] {
603        if let Some(v) = job_map.get(*key) {
604            collect_script_lines(v, &mut lines);
605        }
606    }
607    lines.join("\n")
608}
609
610/// Append script lines from a YAML value (string or sequence of strings).
611fn collect_script_lines(v: &Value, out: &mut Vec<String>) {
612    match v {
613        Value::String(s) => out.push(s.clone()),
614        Value::Sequence(seq) => {
615            for item in seq {
616                if let Some(s) = item.as_str() {
617                    out.push(s.to_string());
618                }
619            }
620        }
621        _ => {}
622    }
623}
624
625/// Extract `artifacts.reports.dotenv` filename. Value may be a single string
626/// or a list of strings — for the list form we join with `,`.
627fn extract_dotenv_file(job_map: &serde_yaml::Mapping) -> Option<String> {
628    let dotenv = job_map
629        .get("artifacts")?
630        .as_mapping()?
631        .get("reports")?
632        .as_mapping()?
633        .get("dotenv")?;
634    match dotenv {
635        Value::String(s) => Some(s.clone()),
636        Value::Sequence(seq) => {
637            let parts: Vec<String> = seq
638                .iter()
639                .filter_map(|v| v.as_str().map(String::from))
640                .collect();
641            if parts.is_empty() {
642                None
643            } else {
644                Some(parts.join(","))
645            }
646        }
647        _ => None,
648    }
649}
650
651/// Extract upstream job names from `needs:` and `dependencies:`.
652/// `needs:` may be a list of strings or a list of mappings with `job:`.
653/// `dependencies:` is a list of strings.
654///
655/// F5: GitLab `needs:` entries support an `artifacts: false` opt-out that
656/// stops the upstream's artifacts (including its `dotenv` report) from
657/// flowing into this job. Excluding those entries here means the comma-joined
658/// `META_NEEDS` consumed by `dotenv_artifact_flows_to_privileged_deployment`
659/// only contains jobs whose artifacts genuinely flow — no rule-side change
660/// needed.
661fn extract_needs(job_map: &serde_yaml::Mapping) -> Vec<String> {
662    let mut out: Vec<String> = Vec::new();
663    if let Some(needs) = job_map.get("needs").and_then(|v| v.as_sequence()) {
664        for item in needs {
665            match item {
666                Value::String(s) => out.push(s.clone()),
667                Value::Mapping(m) => {
668                    let Some(j) = m.get("job").and_then(|j| j.as_str()) else {
669                        continue;
670                    };
671                    // `artifacts:` defaults to true when omitted. Only skip
672                    // when explicitly set to false — anything else (true,
673                    // missing, weird shape) keeps the dependency.
674                    let artifacts_disabled =
675                        m.get("artifacts").and_then(|v| v.as_bool()) == Some(false);
676                    if artifacts_disabled {
677                        continue;
678                    }
679                    out.push(j.to_string());
680                }
681                _ => {}
682            }
683        }
684    }
685    if let Some(deps) = job_map.get("dependencies").and_then(|v| v.as_sequence()) {
686        for item in deps {
687            if let Some(s) = item.as_str() {
688                out.push(s.to_string());
689            }
690        }
691    }
692    out.sort();
693    out.dedup();
694    out
695}
696
697/// Recognise the canonical "is `var` truthy?" shape inside a GitLab CI
698/// `rules: if:` expression. Returns:
699///
700/// * `Some(true)` — the expression positively asserts `var` is truthy
701///   (e.g. `$VAR == "true"`, `$VAR == true`, bare `$VAR`, or any of those
702///   joined to other clauses with `&&`).
703/// * `Some(false)` — the expression negates `var`'s truthiness
704///   (e.g. `$VAR != "true"`, `$VAR == "false"`, `$VAR == null`).
705/// * `None` — the shape isn't recognisable; caller MUST treat as "no positive
706///   signal" (i.e. do not stamp protected-only or merge_request_event metadata).
707///
708/// We deliberately keep this minimal — better to under-claim protection than
709/// over-claim it. Anything we don't understand returns `None`.
710///
711/// Boundary discipline: `var` matches only when it appears as a `$VAR` token
712/// surrounded by non-identifier chars (or string ends), so `$CI_COMMIT_TAG`
713/// does not silently match `$CI_COMMIT_TAG_MESSAGE`.
714fn check_truthy_comparison(expr: &str, var: &str) -> Option<bool> {
715    // Split on `||` first — if ANY top-level disjunct is positive, the
716    // whole expression is positive (any one matching clause makes the rule
717    // fire). For `&&`, all conjuncts must agree; if any conjunct contradicts
718    // the others, we fall back to None.
719    let trimmed = expr.trim();
720    if trimmed.is_empty() {
721        return None;
722    }
723
724    // Top-level `||` short-circuit: if any disjunct is positive, accept.
725    if let Some((lhs, rhs)) = split_top_level(trimmed, "||") {
726        let l = check_truthy_comparison(&lhs, var);
727        let r = check_truthy_comparison(&rhs, var);
728        return match (l, r) {
729            (Some(true), _) | (_, Some(true)) => Some(true),
730            (Some(false), Some(false)) => Some(false),
731            _ => None,
732        };
733    }
734    // Top-level `&&`: positive only if at least one conjunct is positive
735    // and none is explicitly negative. (A conjunct that doesn't mention
736    // `var` is None — neutral — so we treat it as non-blocking.)
737    if let Some((lhs, rhs)) = split_top_level(trimmed, "&&") {
738        let l = check_truthy_comparison(&lhs, var);
739        let r = check_truthy_comparison(&rhs, var);
740        return match (l, r) {
741            (Some(false), _) | (_, Some(false)) => Some(false),
742            (Some(true), _) | (_, Some(true)) => Some(true),
743            _ => None,
744        };
745    }
746
747    // No top-level boolean op — atomic comparison or bare reference.
748    classify_atom(trimmed, var)
749}
750
/// Cut `expr` in two at the first occurrence of `op` that sits at the top
/// level: outside parentheses, outside single- or double-quoted strings and
/// outside `/.../` regex literals.
///
/// Returns the text before and after `op` (the operator itself is dropped,
/// surrounding whitespace is kept), or `None` when there is no such
/// occurrence.
///
/// A `/` only opens a regex literal when the nearest preceding
/// non-whitespace byte is `~` (as in `=~` / `!~`). Inside strings and regex
/// literals a backslash escapes the following byte.
fn split_top_level(expr: &str, op: &str) -> Option<(String, String)> {
    /// What the scanner is currently inside of.
    enum Scan {
        Code,
        Quoted(u8),
        Regex,
    }

    let src = expr.as_bytes();
    let needle = op.as_bytes();
    let mut nesting: i32 = 0;
    let mut state = Scan::Code;
    let mut pos = 0usize;

    while pos < src.len() {
        let byte = src[pos];

        // Inside a literal: skip everything up to its closing delimiter.
        let closing = match state {
            Scan::Code => None,
            Scan::Quoted(quote) => Some(quote),
            Scan::Regex => Some(b'/'),
        };
        if let Some(delimiter) = closing {
            if byte == b'\\' && pos + 1 < src.len() {
                pos += 2;
                continue;
            }
            if byte == delimiter {
                state = Scan::Code;
            }
            pos += 1;
            continue;
        }

        match byte {
            b'"' | b'\'' => {
                state = Scan::Quoted(byte);
                pos += 1;
                continue;
            }
            b'/' => {
                // Regex literal only directly after a `~` (ignoring blanks).
                let before = src[..pos]
                    .iter()
                    .rev()
                    .find(|b| !b.is_ascii_whitespace());
                if before == Some(&b'~') {
                    state = Scan::Regex;
                    pos += 1;
                    continue;
                }
            }
            b'(' => nesting += 1,
            b')' => nesting -= 1,
            _ => {}
        }

        if nesting == 0 && src[pos..].starts_with(needle) {
            let left = expr[..pos].to_string();
            let right = expr[pos + needle.len()..].to_string();
            return Some((left, right));
        }
        pos += 1;
    }
    None
}
821
/// Decide what a single comparison (no top-level `&&` / `||`) says about the
/// truthiness of `var`.
///
/// * `Some(true)`  — the atom is the bare variable, `var == <truthy literal>`
///   or `var != <falsy literal>`.
/// * `Some(false)` — `var == <falsy literal>` or `var != <truthy literal>`.
/// * `None`        — anything else: no `==` / `!=`, `var` is on neither side,
///   or the other side is an arbitrary literal such as a branch name.
///
/// Truthy literals are `true` and `1`; falsy literals are `false`, `null`,
/// `0` and the empty string. Literals are compared ASCII case-insensitively
/// after stripping surrounding quotes.
fn classify_atom(atom: &str, var: &str) -> Option<bool> {
    // Strip surrounding double quotes, then surrounding single quotes.
    fn unquote(text: &str) -> &str {
        text.trim_matches('"').trim_matches('\'')
    }

    let body = atom.trim().trim_matches('(').trim_matches(')').trim();
    // A bare `$VAR` is truthy when the variable is set and non-empty.
    if body == var {
        return Some(true);
    }

    // `==` is tried before `!=`; regex operators and anything else bail out.
    let (is_equality, left, right) = if let Some((l, r)) = body.split_once("==") {
        (true, l.trim(), r.trim())
    } else if let Some((l, r)) = body.split_once("!=") {
        (false, l.trim(), r.trim())
    } else {
        return None;
    };

    // Whichever side is the variable (bare first, then quoted), the other
    // side is the literal being compared against.
    let literal = if left == var {
        right
    } else if right == var {
        left
    } else if unquote(left) == var {
        right
    } else if unquote(right) == var {
        left
    } else {
        return None;
    };

    let normalized = unquote(literal).to_ascii_lowercase();
    match normalized.as_str() {
        "true" | "1" => Some(is_equality),
        "false" | "null" | "" | "0" => Some(!is_equality),
        // An arbitrary string literal (e.g. `$CI_COMMIT_BRANCH == "main"`)
        // is not a truthiness test; leave it to the caller's other checks.
        _ => None,
    }
}
878
879/// Classify a variable name as a credential by checking for common fragments.
880///
881/// Each fragment in `CRED_FRAGMENTS` must appear as a *segment* of the name
882/// (bounded by `_` or by the start/end of the string), NOT as a free-floating
883/// substring. This avoids false positives like `CERTAIN_FLAG` (matches `CERT`
884/// substring), `CERTIFICATE_PATH` (path config, not a credential),
885/// `TOKENIZER_VERSION` (matches `TOKEN`), and `UNCERTAIN`.
886///
887/// A multi-token fragment like `PRIVATE_KEY` matches when its full text appears
888/// at a segment boundary on both sides — i.e. surrounded by `_` or string ends.
889fn is_credential_name(name: &str) -> bool {
890    let upper = name.to_uppercase();
891    let bytes = upper.as_bytes();
892    CRED_FRAGMENTS.iter().any(|frag| {
893        let frag_bytes = frag.as_bytes();
894        let n = frag_bytes.len();
895        if bytes.len() < n {
896            return false;
897        }
898        // Slide the fragment across the name, accepting only segment-bounded matches.
899        for i in 0..=bytes.len() - n {
900            if &bytes[i..i + n] != frag_bytes {
901                continue;
902            }
903            let left_ok = i == 0 || bytes[i - 1] == b'_';
904            let right_ok = i + n == bytes.len() || bytes[i + n] == b'_';
905            if left_ok && right_ok {
906                return true;
907            }
908        }
909        false
910    })
911}
912
913/// Parse `variables:` mapping and emit `Secret` nodes for credential-pattern names.
914/// Returns the list of created node IDs.
915fn process_variables(vars: Option<&Value>, graph: &mut AuthorityGraph, scope: &str) -> Vec<NodeId> {
916    let mut ids = Vec::new();
917    let map = match vars.and_then(|v| v.as_mapping()) {
918        Some(m) => m,
919        None => return ids,
920    };
921    // determinism: sort by key — same YAML must produce same NodeId order
922    let mut entries: Vec<(&Value, &Value)> = map.iter().collect();
923    entries.sort_by(|a, b| a.0.as_str().unwrap_or("").cmp(b.0.as_str().unwrap_or("")));
924    for (k, _v) in entries {
925        let name = match k.as_str() {
926            Some(s) => s,
927            None => continue,
928        };
929        if is_credential_name(name) {
930            let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
931            ids.push(id);
932            let _ = scope; // used for future scoped error messages
933        }
934    }
935    ids
936}
937
938/// Parse `secrets:` block and emit one `Secret` node per named secret.
939///
940/// GitLab CI `secrets:` format:
941/// ```yaml
942/// secrets:
943///   DATABASE_PASSWORD:
944///     vault: production/db/password@secret
945///   AWS_KEY:
946///     aws_secrets_manager:
947///       name: my-secret
948/// ```
949fn process_explicit_secrets(
950    secrets: Option<&Value>,
951    _scope: &str,
952    graph: &mut AuthorityGraph,
953) -> Vec<NodeId> {
954    let mut ids = Vec::new();
955    let map = match secrets.and_then(|v| v.as_mapping()) {
956        Some(m) => m,
957        None => return ids,
958    };
959    // determinism: sort by key — same YAML must produce same NodeId order
960    let mut entries: Vec<(&Value, &Value)> = map.iter().collect();
961    entries.sort_by(|a, b| a.0.as_str().unwrap_or("").cmp(b.0.as_str().unwrap_or("")));
962    for (k, _v) in entries {
963        let name = match k.as_str() {
964            Some(s) => s,
965            None => continue,
966        };
967        let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
968        ids.push(id);
969    }
970    ids
971}
972
973/// Parse `id_tokens:` block and emit one OIDC `Identity` node per token.
974///
975/// GitLab CI `id_tokens:` format:
976/// ```yaml
977/// id_tokens:
978///   SIGSTORE_ID_TOKEN:
979///     aud: sigstore
980///   AWS_OIDC_TOKEN:
981///     aud: https://sts.amazonaws.com
982/// ```
983fn process_id_tokens(
984    id_tokens: Option<&Value>,
985    _scope: &str,
986    graph: &mut AuthorityGraph,
987) -> Vec<NodeId> {
988    let mut ids = Vec::new();
989    let map = match id_tokens.and_then(|v| v.as_mapping()) {
990        Some(m) => m,
991        None => return ids,
992    };
993    // determinism: sort by key — same YAML must produce same NodeId order
994    let mut entries: Vec<(&Value, &Value)> = map.iter().collect();
995    entries.sort_by(|a, b| a.0.as_str().unwrap_or("").cmp(b.0.as_str().unwrap_or("")));
996    for (k, v) in entries {
997        let token_name = match k.as_str() {
998            Some(s) => s,
999            None => continue,
1000        };
1001        // F3: GitLab supports list-form `aud: [a, b, c]` (multi-cloud broker —
1002        // strongest over-scoping signal). Previously `as_str()` on a sequence
1003        // returned None and we fell through to "unknown", silently blinding
1004        // every multi-aud rule. Handle both shapes explicitly.
1005        let aud_value = v.as_mapping().and_then(|m| m.get("aud"));
1006        let (aud_joined, is_list) = match aud_value {
1007            Some(Value::String(s)) => (s.clone(), false),
1008            Some(Value::Sequence(seq)) => {
1009                let parts: Vec<String> = seq
1010                    .iter()
1011                    .filter_map(|item| match item {
1012                        Value::String(s) => Some(s.clone()),
1013                        _ => None,
1014                    })
1015                    .collect();
1016                if parts.is_empty() {
1017                    ("unknown".into(), false)
1018                } else {
1019                    (parts.join(","), true)
1020                }
1021            }
1022            _ => ("unknown".into(), false),
1023        };
1024        let label = format!("{token_name} (aud={aud_joined})");
1025        let mut meta = HashMap::new();
1026        meta.insert(META_OIDC.into(), "true".into());
1027        meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
1028        // Backward-compat: keep the single-`aud` field populated. For the
1029        // list form it now holds the comma-joined string so existing
1030        // consumers see *something* rather than "unknown".
1031        meta.insert(META_OIDC_AUDIENCE.into(), aud_joined.clone());
1032        // New (F3): explicit "list form" marker. Only set on the multi-aud
1033        // path so downstream rules can distinguish single-aud vs multi-aud
1034        // configurations without parsing the comma-joined string.
1035        if is_list {
1036            meta.insert(META_OIDC_AUDIENCES.into(), aud_joined.clone());
1037        }
1038        let id =
1039            graph.add_node_with_metadata(NodeKind::Identity, label, TrustZone::FirstParty, meta);
1040        ids.push(id);
1041    }
1042    ids
1043}
1044
1045/// Parse `services:` block and emit `Image` nodes.
1046fn process_services(services: Option<&Value>, graph: &mut AuthorityGraph) -> Vec<NodeId> {
1047    let mut ids = Vec::new();
1048    let list = match services.and_then(|v| v.as_sequence()) {
1049        Some(s) => s,
1050        None => return ids,
1051    };
1052    for item in list {
1053        let img_str = match extract_image_str(item) {
1054            Some(s) => s,
1055            None => continue,
1056        };
1057        let pinned = is_docker_digest_pinned(&img_str);
1058        let trust_zone = if pinned {
1059            TrustZone::ThirdParty
1060        } else {
1061            TrustZone::Untrusted
1062        };
1063        let mut meta = HashMap::new();
1064        if let Some(digest) = img_str.split("@sha256:").nth(1) {
1065            meta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
1066        }
1067        let id = graph.add_node_with_metadata(NodeKind::Image, &img_str, trust_zone, meta);
1068        ids.push(id);
1069    }
1070    ids
1071}
1072
1073/// Check whether a job's `rules:` or `only:` indicates it runs on merge requests.
1074fn job_has_mr_trigger(job_map: &serde_yaml::Mapping) -> bool {
1075    // rules: [{if: '$CI_PIPELINE_SOURCE == "merge_request_event"'}]
1076    if let Some(rules) = job_map.get("rules").and_then(|v| v.as_sequence()) {
1077        for rule in rules {
1078            if let Some(if_expr) = rule
1079                .as_mapping()
1080                .and_then(|m| m.get("if"))
1081                .and_then(|v| v.as_str())
1082            {
1083                // F2: MR-trigger only fires for the *positive* equality form.
1084                // `$CI_PIPELINE_SOURCE != "merge_request_event"` ("run except
1085                // on MRs") used to set META_TRIGGER=merge_request and pollute
1086                // every downstream MR-context rule.
1087                if matches_mr_event(if_expr) {
1088                    return true;
1089                }
1090            }
1091        }
1092    }
1093    // only: [merge_requests] or only: {refs: [merge_requests]}
1094    if let Some(only) = job_map.get("only") {
1095        if only_has_merge_requests(only) {
1096            return true;
1097        }
1098    }
1099    false
1100}
1101
1102/// Check `only:` value (sequence or mapping) for `merge_requests` entry.
1103fn only_has_merge_requests(v: &Value) -> bool {
1104    match v {
1105        Value::Sequence(seq) => seq
1106            .iter()
1107            .any(|item| item.as_str() == Some("merge_requests")),
1108        Value::Mapping(m) => {
1109            if let Some(refs) = m.get("refs").and_then(|r| r.as_sequence()) {
1110                return refs
1111                    .iter()
1112                    .any(|item| item.as_str() == Some("merge_requests"));
1113            }
1114            false
1115        }
1116        _ => false,
1117    }
1118}
1119
1120/// Returns true when a job's `rules:` or `only:` clause restricts execution
1121/// to protected refs only. The set of accepted patterns is intentionally
1122/// generous because the goal is to *credit* defensive intent, not to
1123/// audit-grade verify that every protection actually exists in GitLab's
1124/// branch-protection settings — that lives outside the YAML.
1125///
1126/// Patterns recognised as a protected-only restriction:
1127///
1128///   * any `rules: [{ if: ... $CI_COMMIT_REF_PROTECTED ... }]`
1129///   * any `rules: [{ if: ... $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH ... }]`
1130///     (default branch is GitLab-protected by default)
1131///   * any `rules: [{ if: ... $CI_COMMIT_TAG ... }]` (tags are protected by default)
1132///   * `only: [main]` / `only: [master]` / `only: tags`
1133///   * `only: { refs: [main, /^release/.*/] }`
1134///
1135/// Hits any one of the above → true. Misses every one → false.
1136fn job_has_protected_branch_restriction(job_map: &serde_yaml::Mapping) -> bool {
1137    if let Some(rules) = job_map.get("rules").and_then(|v| v.as_sequence()) {
1138        for rule in rules {
1139            let Some(if_expr) = rule
1140                .as_mapping()
1141                .and_then(|m| m.get("if"))
1142                .and_then(|v| v.as_str())
1143            else {
1144                continue;
1145            };
1146            // F1: `$CI_COMMIT_REF_PROTECTED` — only a *positive* assertion
1147            // ("ref IS protected") counts. `== "false"` or `!= "true"` is the
1148            // exact opposite signal and must NOT stamp protected-only.
1149            if matches!(
1150                check_truthy_comparison(if_expr, "$CI_COMMIT_REF_PROTECTED"),
1151                Some(true)
1152            ) {
1153                return true;
1154            }
1155            if if_expr.contains("$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH")
1156                || if_expr.contains("$CI_DEFAULT_BRANCH == $CI_COMMIT_BRANCH")
1157            {
1158                return true;
1159            }
1160            // F1: `$CI_COMMIT_TAG` — only the truthy form ("running on a
1161            // tag", which GitLab protects by default). Reject negations
1162            // (`== null`, `!= ...`) and avoid the substring-collision with
1163            // `$CI_COMMIT_TAG_MESSAGE` that the previous `contains()` had.
1164            if matches!(
1165                check_truthy_comparison(if_expr, "$CI_COMMIT_TAG"),
1166                Some(true)
1167            ) {
1168                return true;
1169            }
1170        }
1171    }
1172    if let Some(only) = job_map.get("only") {
1173        if only_lists_protected_ref(only) {
1174            return true;
1175        }
1176    }
1177    false
1178}
1179
1180/// Check `only:` for protected/default-branch refs (`main`, `master`, `tags`,
1181/// or a `refs:` list containing those). Conservative — does NOT include
1182/// `merge_requests` (that's the opposite signal).
1183fn only_lists_protected_ref(v: &Value) -> bool {
1184    fn is_protected_ref(s: &str) -> bool {
1185        matches!(s, "main" | "master" | "tags") || s.starts_with("/^release")
1186    }
1187    match v {
1188        Value::String(s) => is_protected_ref(s.as_str()),
1189        Value::Sequence(seq) => seq
1190            .iter()
1191            .any(|item| item.as_str().map(is_protected_ref).unwrap_or(false)),
1192        Value::Mapping(m) => {
1193            if let Some(refs) = m.get("refs").and_then(|r| r.as_sequence()) {
1194                return refs
1195                    .iter()
1196                    .any(|item| item.as_str().map(is_protected_ref).unwrap_or(false));
1197            }
1198            false
1199        }
1200        _ => false,
1201    }
1202}
1203
1204/// Check top-level `workflow:` rules for MR trigger.
1205fn has_mr_trigger_in_workflow(wf: &Value) -> bool {
1206    let rules = match wf
1207        .as_mapping()
1208        .and_then(|m| m.get("rules"))
1209        .and_then(|r| r.as_sequence())
1210    {
1211        Some(r) => r,
1212        None => return false,
1213    };
1214    for rule in rules {
1215        if let Some(if_expr) = rule
1216            .as_mapping()
1217            .and_then(|m| m.get("if"))
1218            .and_then(|v| v.as_str())
1219        {
1220            // F2: see `job_has_mr_trigger` — only the positive equality form
1221            // counts; negations are rejected.
1222            if matches_mr_event(if_expr) {
1223                return true;
1224            }
1225        }
1226    }
1227    false
1228}
1229
1230/// Returns true when `if_expr` positively asserts that the pipeline source IS
1231/// `merge_request_event`. Accepts `$CI_PIPELINE_SOURCE == "merge_request_event"`
1232/// (and quoted/`||`/`&&` variants) at the truthy-comparison level. Rejects the
1233/// `!=` negation form. Falls back to `false` for anything we can't parse — the
1234/// caller always treats that as "no MR trigger detected".
1235fn matches_mr_event(if_expr: &str) -> bool {
1236    // We don't have a `var == "merge_request_event"` pseudo-variable, so we
1237    // synthesise one: split on `||` ourselves and look for any disjunct that
1238    // is exactly `$CI_PIPELINE_SOURCE == "merge_request_event"` (with
1239    // tolerable whitespace and quoting variations).
1240    fn atom_is_mr_event(atom: &str) -> bool {
1241        let s = atom.trim().trim_matches('(').trim_matches(')').trim();
1242        let (lhs, rhs) = match s.split_once("==") {
1243            Some(parts) => parts,
1244            None => return false,
1245        };
1246        let lhs = lhs.trim();
1247        let rhs_norm = rhs.trim().trim_matches('"').trim_matches('\'');
1248        // Either side may carry the variable; the other must equal the literal.
1249        let lhs_unq = lhs.trim_matches('"').trim_matches('\'');
1250        let rhs_raw = rhs.trim().trim_matches('"').trim_matches('\'');
1251        if (lhs_unq == "$CI_PIPELINE_SOURCE" && rhs_norm == "merge_request_event")
1252            || (rhs_raw == "$CI_PIPELINE_SOURCE" && lhs_unq == "merge_request_event")
1253        {
1254            return true;
1255        }
1256        false
1257    }
1258    let trimmed = if_expr.trim();
1259    // Top-level `||` short-circuit: any positive disjunct wins.
1260    if let Some((lhs, rhs)) = split_top_level(trimmed, "||") {
1261        return atom_is_mr_event(&lhs) || matches_mr_event(&rhs);
1262    }
1263    // For `&&`, accept if any conjunct is a positive `merge_request_event`
1264    // comparison. We don't try to detect contradictory conjuncts —
1265    // `merge_request_event` is a string literal, not a boolean, so the
1266    // truthiness short-circuiting in `check_truthy_comparison` doesn't apply.
1267    if let Some((lhs, rhs)) = split_top_level(trimmed, "&&") {
1268        return atom_is_mr_event(&lhs) || matches_mr_event(&rhs);
1269    }
1270    atom_is_mr_event(trimmed)
1271}
1272
1273/// Structured representation of a single `include:` entry.
1274///
1275/// Serialised into `AuthorityGraph::metadata[META_GITLAB_INCLUDES]` so that
1276/// downstream rules (e.g. `unpinned_include_remote_or_branch_ref`) can analyse
1277/// remote-URL pins, project refs, and missing `ref:` defaults without re-parsing
1278/// the YAML.
1279#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
1280pub struct IncludeEntry {
1281    /// Include kind: `local`, `remote`, `template`, `project`, `component`, or
1282    /// `unknown` for shapes we don't recognise.
1283    pub kind: String,
1284    /// The path / URL / project string the include points at.
1285    pub target: String,
1286    /// The resolved `ref:` value. Empty string when the include omits a `ref:`
1287    /// (defaults to HEAD on the source repo, which is itself a finding).
1288    pub git_ref: String,
1289}
1290
1291/// Parse the top-level `include:` value into a flat list of `IncludeEntry`s.
1292///
1293/// `include:` accepts five shapes — string, sequence-of-strings, sequence-of-mappings,
1294/// sequence-of-strings-mixed-with-mappings, and a single mapping. Normalise all of
1295/// them into one flat list so the rule layer doesn't have to.
1296pub fn extract_include_entries(v: &Value) -> Vec<IncludeEntry> {
1297    let mut out = Vec::new();
1298    match v {
1299        // `include: 'path/to/local.yml'` — sugar for a local include
1300        Value::String(s) => {
1301            out.push(IncludeEntry {
1302                kind: classify_string_include(s).into(),
1303                target: s.clone(),
1304                git_ref: String::new(),
1305            });
1306        }
1307        Value::Sequence(seq) => {
1308            for item in seq {
1309                match item {
1310                    Value::String(s) => {
1311                        out.push(IncludeEntry {
1312                            kind: classify_string_include(s).into(),
1313                            target: s.clone(),
1314                            git_ref: String::new(),
1315                        });
1316                    }
1317                    Value::Mapping(m) => {
1318                        if let Some(e) = include_entry_from_mapping(m) {
1319                            out.push(e);
1320                        }
1321                    }
1322                    _ => {}
1323                }
1324            }
1325        }
1326        Value::Mapping(m) => {
1327            if let Some(e) = include_entry_from_mapping(m) {
1328                out.push(e);
1329            }
1330        }
1331        _ => {}
1332    }
1333    out
1334}
1335
/// Classify a bare `include:` string as `"remote"` or `"local"`.
///
/// Heuristic: a string that starts with `http://` or `https://` (compared
/// ASCII case-insensitively) is the shorthand form of a `remote:` include;
/// every other string is treated as a `local:` path.
fn classify_string_include(s: &str) -> &'static str {
    const URL_SCHEMES: [&str; 2] = ["http://", "https://"];
    let normalized = s.to_ascii_lowercase();
    let looks_like_url = URL_SCHEMES
        .iter()
        .any(|scheme| normalized.starts_with(*scheme));
    if looks_like_url {
        "remote"
    } else {
        "local"
    }
}
1346
1347/// Lift one of the four mapping forms (`local:`, `remote:`, `template:`,
1348/// `project:`, `component:`) into an `IncludeEntry`. Returns None when the
1349/// mapping has none of the recognised keys.
1350fn include_entry_from_mapping(m: &serde_yaml::Mapping) -> Option<IncludeEntry> {
1351    let str_at = |key: &str| {
1352        m.get(key)
1353            .and_then(|v| v.as_str())
1354            .map(str::to_string)
1355            .unwrap_or_default()
1356    };
1357    if let Some(s) = m.get("local").and_then(|v| v.as_str()) {
1358        return Some(IncludeEntry {
1359            kind: "local".into(),
1360            target: s.to_string(),
1361            git_ref: String::new(),
1362        });
1363    }
1364    if let Some(s) = m.get("remote").and_then(|v| v.as_str()) {
1365        return Some(IncludeEntry {
1366            kind: "remote".into(),
1367            target: s.to_string(),
1368            git_ref: String::new(),
1369        });
1370    }
1371    if let Some(s) = m.get("template").and_then(|v| v.as_str()) {
1372        return Some(IncludeEntry {
1373            kind: "template".into(),
1374            target: s.to_string(),
1375            git_ref: String::new(),
1376        });
1377    }
1378    if let Some(s) = m.get("component").and_then(|v| v.as_str()) {
1379        // GitLab CI/CD components: source@version → version is the pin
1380        let (target, git_ref) = match s.rsplit_once('@') {
1381            Some((path, ver)) => (path.to_string(), ver.to_string()),
1382            None => (s.to_string(), String::new()),
1383        };
1384        return Some(IncludeEntry {
1385            kind: "component".into(),
1386            target,
1387            git_ref,
1388        });
1389    }
1390    if m.contains_key("project") {
1391        let project = str_at("project");
1392        // ref: may be missing → empty string indicates HEAD/default branch,
1393        // which is itself a supply-chain finding.
1394        let git_ref = str_at("ref");
1395        return Some(IncludeEntry {
1396            kind: "project".into(),
1397            target: project,
1398            git_ref,
1399        });
1400    }
1401    None
1402}
1403
1404/// Extract a flat list of template names from an `extends:` value.
1405/// `extends:` accepts a single string or a sequence of strings.
1406fn extract_extends_list(v: Option<&Value>) -> Vec<String> {
1407    let v = match v {
1408        Some(v) => v,
1409        None => return Vec::new(),
1410    };
1411    match v {
1412        Value::String(s) => vec![s.clone()],
1413        Value::Sequence(seq) => seq
1414            .iter()
1415            .filter_map(|i| i.as_str().map(str::to_string))
1416            .collect(),
1417        _ => Vec::new(),
1418    }
1419}
1420
1421/// Returns true when any entry in `services:` has an image name matching
1422/// `docker:*-dind` (or bare `docker:dind`). Recognises both shapes:
1423/// `services: [docker:dind]` and `services: [{name: docker:dind}]`.
1424fn job_services_have_dind(services: Option<&Value>) -> bool {
1425    let list = match services.and_then(|v| v.as_sequence()) {
1426        Some(s) => s,
1427        None => return false,
1428    };
1429    for item in list {
1430        let img = match extract_image_str(item) {
1431            Some(s) => s,
1432            None => continue,
1433        };
1434        if image_is_dind(&img) {
1435            return true;
1436        }
1437    }
1438    false
1439}
1440
/// Recognise Docker-in-Docker images such as `docker:dind`,
/// `docker:24.0-dind`, `docker:24-dind` and `docker:24.0.7-dind-rootless`.
///
/// The check is ASCII case-insensitive and ignores everything from the first
/// `@` onwards, so a digest-pinned reference is judged by its name and tag
/// only. What remains must start with `docker:` or `docker/` and contain
/// `dind`.
fn image_is_dind(image: &str) -> bool {
    const DOCKER_PREFIXES: [&str; 2] = ["docker:", "docker/"];
    let lowered = image.to_ascii_lowercase();
    // Drop any `@sha256:...` suffix before looking at the tag.
    let reference = lowered.split('@').next().unwrap_or(lowered.as_str());
    let is_docker_image = DOCKER_PREFIXES
        .iter()
        .any(|prefix| reference.starts_with(*prefix));
    is_docker_image && reference.contains("dind")
}
1457
1458/// Classify a `trigger:` block as either `static` (in-tree YAML / fixed
1459/// downstream project) or `dynamic` (include from a previous job's artifact —
1460/// dynamic child pipelines, the code-injection sink). Returns None when no
1461/// `trigger:` block is present.
1462fn classify_trigger(trigger: Option<&Value>) -> Option<&'static str> {
1463    let t = trigger?;
1464    // Shorthand: `trigger: my/downstream/project` → static
1465    if t.is_string() {
1466        return Some("static");
1467    }
1468    let m = t.as_mapping()?;
1469    // Look at every `include:` entry under trigger; if ANY one references an
1470    // `artifact:` field, the child pipeline is dynamic.
1471    if let Some(inc) = m.get("include") {
1472        if include_has_artifact_source(inc) {
1473            return Some("dynamic");
1474        }
1475    }
1476    Some("static")
1477}
1478
1479/// Walk a `trigger.include:` value (string / sequence / mapping) and return
1480/// true when any entry's mapping carries an `artifact:` key.
1481fn include_has_artifact_source(v: &Value) -> bool {
1482    match v {
1483        Value::Mapping(m) => m.contains_key("artifact"),
1484        Value::Sequence(seq) => seq.iter().any(|i| {
1485            i.as_mapping()
1486                .map(|m| m.contains_key("artifact"))
1487                .unwrap_or(false)
1488        }),
1489        _ => false,
1490    }
1491}
1492
1493/// Extract `(cache.key, cache.policy)` from a job's `cache:` value. Returns
1494/// `None` when no cache is declared. `cache:` may be a sequence of mappings
1495/// (multiple caches); we capture the first key/policy pair so the rule layer
1496/// has at least one signal — multi-cache analysis is left to a future
1497/// extension.
1498///
1499/// `cache.key:` may be:
1500/// - a string: `key: vendor`
1501/// - a mapping: `key: { files: [Gemfile.lock] }` → captured as `files:Gemfile.lock,...`
1502/// - a mapping with `prefix:` → captured as `prefix:<value>`
1503fn extract_cache_key_policy(v: Option<&Value>) -> Option<(String, Option<String>)> {
1504    let v = v?;
1505    let m = match v {
1506        Value::Mapping(m) => m,
1507        Value::Sequence(seq) => {
1508            // First cache wins — same heuristic used elsewhere.
1509            return seq
1510                .iter()
1511                .find_map(|i| i.as_mapping().and_then(extract_cache_key_policy_map));
1512        }
1513        _ => return None,
1514    };
1515    extract_cache_key_policy_map(m)
1516}
1517
1518fn extract_cache_key_policy_map(m: &serde_yaml::Mapping) -> Option<(String, Option<String>)> {
1519    let key = match m.get("key") {
1520        Some(Value::String(s)) => s.clone(),
1521        Some(Value::Number(n)) => n.to_string(),
1522        Some(Value::Bool(b)) => b.to_string(),
1523        Some(Value::Mapping(km)) => {
1524            let mut parts = Vec::new();
1525            if let Some(prefix) = km.get("prefix").and_then(|v| v.as_str()) {
1526                parts.push(format!("prefix:{prefix}"));
1527            }
1528            if let Some(files) = km.get("files").and_then(|v| v.as_sequence()) {
1529                let names: Vec<String> = files
1530                    .iter()
1531                    .filter_map(|f| f.as_str().map(str::to_string))
1532                    .collect();
1533                if !names.is_empty() {
1534                    parts.push(format!("files:{}", names.join(",")));
1535                }
1536            }
1537            if parts.is_empty() {
1538                String::new()
1539            } else {
1540                parts.join(";")
1541            }
1542        }
1543        _ => String::new(),
1544    };
1545    let policy = m.get("policy").and_then(|v| v.as_str()).map(str::to_string);
1546    Some((key, policy))
1547}
1548
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `yaml` as a `.gitlab-ci.yml` with no repo/ref context and
    /// panics if the parser returns an error.
    fn parse(yaml: &str) -> AuthorityGraph {
        let source = PipelineSource {
            file: ".gitlab-ci.yml".into(),
            repo: None,
            git_ref: None,
            commit_sha: None,
        };
        GitlabParser.parse(yaml, &source).unwrap()
    }

    /// Number of `Step` nodes in `graph`.
    fn step_count(graph: &AuthorityGraph) -> usize {
        graph
            .nodes
            .iter()
            .filter(|n| n.kind == NodeKind::Step)
            .count()
    }

    /// Parses `yaml` and returns the `META_RULES_PROTECTED_ONLY` value stamped
    /// on its first step, if any.
    fn protected_only_stamp(yaml: &str) -> Option<String> {
        let graph = parse(yaml);
        let step = graph
            .nodes_of_kind(NodeKind::Step)
            .next()
            .expect("fixture must produce at least one step");
        step.metadata.get(META_RULES_PROTECTED_ONLY).cloned()
    }

    /// Returns the `META_NEEDS` CSV of the step whose job name is `job`
    /// (empty string when the key is absent).
    fn needs_csv(graph: &AuthorityGraph, job: &str) -> String {
        graph
            .nodes_of_kind(NodeKind::Step)
            .find(|n| n.metadata.get(META_JOB_NAME).map(String::as_str) == Some(job))
            .unwrap_or_else(|| panic!("{job} step present"))
            .metadata
            .get(META_NEEDS)
            .cloned()
            .unwrap_or_default()
    }

    #[test]
    fn ci_job_token_always_present() {
        let yaml = r#"
stages:
  - build

build-job:
  stage: build
  script:
    - make build
"#;
        let graph = parse(yaml);
        let identities: Vec<_> = graph.nodes_of_kind(NodeKind::Identity).collect();
        assert_eq!(identities.len(), 1);
        let token = identities[0];
        assert_eq!(token.name, "CI_JOB_TOKEN");
        assert_eq!(
            token.metadata.get(META_IMPLICIT).map(String::as_str),
            Some("true")
        );
        assert_eq!(
            token.metadata.get(META_IDENTITY_SCOPE).map(String::as_str),
            Some("broad")
        );
    }

    #[test]
    fn global_credential_variable_emits_secret_node() {
        let yaml = r#"
variables:
  APP_VERSION: "1.0"
  DEPLOY_TOKEN: "$CI_DEPLOY_TOKEN"

build-job:
  script:
    - make
"#;
        let graph = parse(yaml);
        let secrets: Vec<_> = graph.nodes_of_kind(NodeKind::Secret).collect();
        assert!(
            secrets.iter().any(|s| s.name == "DEPLOY_TOKEN"),
            "DEPLOY_TOKEN must emit a Secret node, got: {:?}",
            secrets.iter().map(|s| &s.name).collect::<Vec<_>>()
        );
        // Plain config variable must not emit Secret
        assert!(
            !secrets.iter().any(|s| s.name == "APP_VERSION"),
            "APP_VERSION must not emit a Secret node"
        );
    }

    #[test]
    fn floating_image_emits_untrusted_image_node() {
        let yaml = r#"
deploy:
  image: alpine:latest
  script:
    - deploy.sh
"#;
        let graph = parse(yaml);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].name, "alpine:latest");
        assert_eq!(images[0].trust_zone, TrustZone::Untrusted);
    }

    #[test]
    fn digest_pinned_image_is_third_party() {
        let yaml = r#"
deploy:
  image: "alpine@sha256:a5ac7e51b41094c92402da3b24376905380afc29a5ac7e51b41094c92402da3b"
  script:
    - deploy.sh
"#;
        let graph = parse(yaml);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].trust_zone, TrustZone::ThirdParty);
    }

    #[test]
    fn id_tokens_emit_oidc_identity_nodes() {
        let yaml = r#"
deploy:
  id_tokens:
    SIGSTORE_ID_TOKEN:
      aud: sigstore
    AWS_OIDC_TOKEN:
      aud: https://sts.amazonaws.com
  script:
    - deploy.sh
"#;
        let graph = parse(yaml);
        let oidc: Vec<_> = graph
            .nodes_of_kind(NodeKind::Identity)
            .filter(|n| n.metadata.get(META_OIDC).map(String::as_str) == Some("true"))
            .collect();
        assert_eq!(
            oidc.len(),
            2,
            "expected 2 OIDC identity nodes, got: {:?}",
            oidc.iter().map(|n| &n.name).collect::<Vec<_>>()
        );
    }

    #[test]
    fn explicit_secrets_emit_secret_nodes() {
        let yaml = r#"
deploy:
  secrets:
    DATABASE_PASSWORD:
      vault: production/db/password@secret
    AWS_KEY:
      aws_secrets_manager:
        name: my-secret
  script:
    - deploy.sh
"#;
        let graph = parse(yaml);
        let secrets: Vec<_> = graph.nodes_of_kind(NodeKind::Secret).collect();
        let names: Vec<_> = secrets.iter().map(|s| s.name.as_str()).collect();
        assert!(names.contains(&"DATABASE_PASSWORD"), "got: {names:?}");
        assert!(names.contains(&"AWS_KEY"), "got: {names:?}");
    }

    #[test]
    fn rules_mr_trigger_sets_meta_trigger() {
        let yaml = r#"
test:
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
  script:
    - run tests
"#;
        let graph = parse(yaml);
        assert_eq!(
            graph.metadata.get(META_TRIGGER).map(String::as_str),
            Some("merge_request"),
            "META_TRIGGER must be set to merge_request"
        );
    }

    #[test]
    fn only_merge_requests_sets_meta_trigger() {
        let yaml = r#"
test:
  only:
    - merge_requests
  script:
    - run tests
"#;
        let graph = parse(yaml);
        assert_eq!(
            graph.metadata.get(META_TRIGGER).map(String::as_str),
            Some("merge_request")
        );
    }

    #[test]
    fn include_marks_graph_partial() {
        let yaml = r#"
include:
  - local: '/templates/.base.yml'

build:
  script:
    - make
"#;
        let graph = parse(yaml);
        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
        // `.first()` instead of `[0]` so a missing gap fails the assertion
        // with a readable diff rather than an index-out-of-bounds panic.
        assert_eq!(
            graph.completeness_gap_kinds.first(),
            Some(&GapKind::Structural)
        );
    }

    #[test]
    fn extends_marks_graph_partial() {
        let yaml = r#"
.base:
  script:
    - echo base

my-job:
  extends: .base
  stage: build
"#;
        let graph = parse(yaml);
        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
        // Structural gaps are expected from the hidden `.base` template job
        // and the `extends:` inheritance on my-job. Guard against an empty
        // list first: `.all(..)` below would pass vacuously on it.
        assert!(
            !graph.completeness_gap_kinds.is_empty(),
            "a Partial graph must record at least one gap kind"
        );
        assert!(
            graph
                .completeness_gap_kinds
                .iter()
                .all(|k| *k == GapKind::Structural),
            "expected all gaps Structural, got: {:?}",
            graph.completeness_gap_kinds
        );
    }

    #[test]
    fn meta_job_name_set_on_step_nodes() {
        let yaml = r#"
build:
  script:
    - make
deploy:
  script:
    - deploy.sh
"#;
        let graph = parse(yaml);
        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
        assert_eq!(steps.len(), 2);
        for step in &steps {
            assert!(
                step.metadata.contains_key(META_JOB_NAME),
                "Step '{}' missing META_JOB_NAME",
                step.name
            );
        }
        // Verify job names are correct
        let names: Vec<_> = steps
            .iter()
            .map(|s| s.metadata.get(META_JOB_NAME).unwrap().as_str())
            .collect();
        assert!(names.contains(&"build"), "got: {names:?}");
        assert!(names.contains(&"deploy"), "got: {names:?}");
    }

    #[test]
    fn reserved_keywords_not_parsed_as_jobs() {
        let yaml = r#"
stages:
  - build
  - test

variables:
  MY_VAR: value

image: alpine:latest

build:
  stage: build
  script:
    - make
"#;
        let graph = parse(yaml);
        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
        assert_eq!(
            steps.len(),
            1,
            "only 'build' should be a Step, got: {:?}",
            steps.iter().map(|s| &s.name).collect::<Vec<_>>()
        );
        assert_eq!(steps[0].name, "build");
    }

    #[test]
    fn services_emit_image_nodes() {
        let yaml = r#"
test:
  services:
    - docker:dind
    - name: postgres:14
  script:
    - run_tests
"#;
        let graph = parse(yaml);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(
            images.len(),
            2,
            "expected 2 service Image nodes, got: {:?}",
            images.iter().map(|i| &i.name).collect::<Vec<_>>()
        );
    }

    // ── Cross-platform misclassification trap (red-team R2 #5) ─────

    #[test]
    fn job_carrier_with_unparseable_bodies_marks_partial() {
        // Top-level keys that look like job names but whose values are not
        // mappings (lists, scalars). The GitLab parser only produces a Step
        // for non-reserved *mapping-valued* keys, so every candidate here is
        // skipped and the graph has 0 step nodes.
        //
        // Despite the test name, this shape must stay Complete: the
        // "had_job_carrier" heuristic only fires when the value IS a mapping.
        // That is intentional — the heuristic targets the trap where an
        // attacker uses a *valid mapping shape* the GitLab parser can't
        // interpret, not arbitrary non-mapping values.
        let yaml = r#"
build:
  - this is a list, not a mapping
test:
  - also a list
"#;
        let graph = parse(yaml);
        assert_eq!(step_count(&graph), 0);
        assert_eq!(
            graph.completeness,
            AuthorityCompleteness::Complete,
            "non-mapping values are not job carriers"
        );
    }

    // ── Regression tests for F1-F6 (gitlab-parser deep review) ──────────

    /// F1: `$CI_COMMIT_REF_PROTECTED == "true"` stamps protected-only;
    /// the negation `== "false"` must NOT stamp.
    #[test]
    fn protected_ref_only_stamps_meta_when_truly_positive() {
        let positive = r#"
deploy:
  rules:
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
  script:
    - deploy.sh
"#;
        assert_eq!(
            protected_only_stamp(positive).as_deref(),
            Some("true"),
            "positive == \"true\" comparison must stamp META_RULES_PROTECTED_ONLY"
        );

        let negation = r#"
deploy:
  rules:
    - if: '$CI_COMMIT_REF_PROTECTED == "false"'
  script:
    - deploy.sh
"#;
        let stamp = protected_only_stamp(negation);
        assert!(
            stamp.is_none(),
            "== \"false\" is the OPPOSITE signal — must NOT stamp META_RULES_PROTECTED_ONLY (got: {:?})",
            stamp
        );

        // `!= "true"` is also a negation — must not stamp.
        let inequality = r#"
deploy:
  rules:
    - if: '$CI_COMMIT_REF_PROTECTED != "true"'
  script:
    - deploy.sh
"#;
        assert!(
            protected_only_stamp(inequality).is_none(),
            "!= \"true\" is a negation — must NOT stamp META_RULES_PROTECTED_ONLY"
        );

        // `$CI_COMMIT_TAG_MESSAGE` substring trap — used to match because
        // `if_expr.contains("$CI_COMMIT_TAG")` was true even though the var
        // is a different one.
        let tag_message_trap = r#"
deploy:
  rules:
    - if: '$CI_COMMIT_TAG_MESSAGE == "release"'
  script:
    - deploy.sh
"#;
        assert!(
            protected_only_stamp(tag_message_trap).is_none(),
            "$CI_COMMIT_TAG_MESSAGE must not match the $CI_COMMIT_TAG predicate"
        );
    }

    /// F2: `$CI_PIPELINE_SOURCE != "merge_request_event"` ("run except on MRs")
    /// must NOT stamp `META_TRIGGER=merge_request`. Only the positive form
    /// counts.
    #[test]
    fn mr_trigger_detection_rejects_negation() {
        let negation = r#"
build:
  rules:
    - if: '$CI_PIPELINE_SOURCE != "merge_request_event"'
  script:
    - make build
"#;
        let graph = parse(negation);
        assert!(
            graph.metadata.get(META_TRIGGER).map(String::as_str) != Some("merge_request"),
            "negation form must not stamp META_TRIGGER=merge_request, got: {:?}",
            graph.metadata.get(META_TRIGGER)
        );
        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
        assert_eq!(steps.len(), 1);
        assert!(
            steps[0].metadata.get(META_TRIGGER).map(String::as_str) != Some("merge_request"),
            "negation form must not stamp per-step META_TRIGGER=merge_request"
        );

        // Positive form still works.
        let positive = r#"
build:
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
  script:
    - make build
"#;
        let graph = parse(positive);
        assert_eq!(
            graph.metadata.get(META_TRIGGER).map(String::as_str),
            Some("merge_request"),
            "positive form must still stamp META_TRIGGER=merge_request"
        );
    }

    /// F3: list-form `aud:` produces `META_OIDC_AUDIENCES` (plural) and a
    /// comma-joined `META_OIDC_AUDIENCE` for backward compat. Scalar form
    /// stamps only `META_OIDC_AUDIENCE` and leaves the plural marker absent.
    #[test]
    fn id_tokens_aud_list_form_creates_audiences_metadata() {
        let yaml = r#"
deploy:
  id_tokens:
    MULTI_CLOUD_TOKEN:
      aud:
        - https://aws.amazonaws.com
        - https://gcp.googleapis.com
  script:
    - deploy.sh
"#;
        let graph = parse(yaml);
        let oidc: Vec<_> = graph
            .nodes_of_kind(NodeKind::Identity)
            .filter(|n| n.metadata.get(META_OIDC).map(String::as_str) == Some("true"))
            .collect();
        assert_eq!(oidc.len(), 1);
        assert_eq!(
            oidc[0]
                .metadata
                .get(META_OIDC_AUDIENCES)
                .map(String::as_str),
            Some("https://aws.amazonaws.com,https://gcp.googleapis.com"),
            "list-form aud must stamp comma-joined META_OIDC_AUDIENCES"
        );
        // Backward compat: META_OIDC_AUDIENCE holds the same comma-joined value
        // (no longer "unknown" as it was before the fix).
        assert_eq!(
            oidc[0].metadata.get(META_OIDC_AUDIENCE).map(String::as_str),
            Some("https://aws.amazonaws.com,https://gcp.googleapis.com"),
        );
        assert!(oidc[0].name.contains("aud=https://aws"));

        // Scalar form: META_OIDC_AUDIENCE is the bare string, plural marker absent.
        let scalar = r#"
deploy:
  id_tokens:
    AWS_TOKEN:
      aud: https://sts.amazonaws.com
  script:
    - deploy.sh
"#;
        let graph = parse(scalar);
        let oidc: Vec<_> = graph
            .nodes_of_kind(NodeKind::Identity)
            .filter(|n| n.metadata.get(META_OIDC).map(String::as_str) == Some("true"))
            .collect();
        // Check the count before indexing so a regression reports a clear
        // assertion failure instead of an out-of-bounds panic.
        assert_eq!(oidc.len(), 1);
        assert_eq!(
            oidc[0].metadata.get(META_OIDC_AUDIENCE).map(String::as_str),
            Some("https://sts.amazonaws.com")
        );
        assert!(
            !oidc[0].metadata.contains_key(META_OIDC_AUDIENCES),
            "scalar form must NOT set the plural META_OIDC_AUDIENCES marker"
        );
    }

    /// F4: `is_credential_name` must boundary-check; substring matches like
    /// `CERTAIN_FLAG` (contains `CERT`), `TOKENIZER_VERSION` (contains `TOKEN`),
    /// `UNCERTAIN`, and `CERTIFICATE_PATH` (path config, not a credential)
    /// must all return false. Real credentials still match.
    #[test]
    fn is_credential_name_boundary_checks() {
        // False positives that the substring matcher used to flag.
        assert!(!is_credential_name("CERTAIN_FLAG"));
        assert!(!is_credential_name("TOKENIZER_VERSION"));
        assert!(!is_credential_name("UNCERTAIN"));
        assert!(!is_credential_name("CERTIFICATE_PATH"));
        assert!(!is_credential_name("TOKEN1"));
        assert!(!is_credential_name("CERTIFICATE"));

        // True positives — must still match.
        assert!(is_credential_name("API_TOKEN"));
        assert!(is_credential_name("MY_CERT"));
        assert!(is_credential_name("DB_PASSWORD"));
        assert!(is_credential_name("DEPLOY_TOKEN"));
        assert!(is_credential_name("SIGNING_KEY"));
        assert!(is_credential_name("AWS_SECRET_ACCESS_KEY"));
        assert!(is_credential_name("TOKEN"));
        assert!(is_credential_name("CERT"));
        assert!(is_credential_name("PRIVATE_KEY"));
        assert!(is_credential_name("CREDENTIAL"));
    }

    /// F5: a `needs:` entry with `artifacts: false` does NOT promote the
    /// upstream's dotenv into this job, so it must be excluded from
    /// `META_NEEDS` (the dotenv-flow rule reads that CSV verbatim).
    #[test]
    fn needs_artifacts_false_excludes_dotenv_flow() {
        let yaml = r#"
build:
  artifacts:
    reports:
      dotenv: build.env
  script:
    - make build
deploy:
  needs:
    - job: build
      artifacts: false
  script:
    - kubectl apply
"#;
        let graph = parse(yaml);
        let needs_csv_value = needs_csv(&graph, "deploy");
        let needs_csv = needs_csv_value.as_str();
        assert!(
            !needs_csv.split(',').any(|s| s == "build"),
            "build must be excluded from META_NEEDS when artifacts: false (got: {needs_csv:?})"
        );

        // Sanity check: same YAML with `artifacts: true` (or missing) still
        // includes the upstream so dotenv-flow rules can fire.
        let yaml_default = r#"
build:
  artifacts:
    reports:
      dotenv: build.env
  script:
    - make build
deploy:
  needs:
    - job: build
  script:
    - kubectl apply
"#;
        let graph = parse(yaml_default);
        let needs_csv_value = self::needs_csv(&graph, "deploy");
        let needs_csv = needs_csv_value.as_str();
        assert!(
            needs_csv.split(',').any(|s| s == "build"),
            "default (artifacts implicitly true) must keep build in META_NEEDS (got: {needs_csv:?})"
        );
    }

    /// F6: 9× parse with bucket-defeating key names — even if a future
    /// refactor swapped the indexmap-backed mapping for a HashMap-backed
    /// one, the explicit sort would keep NodeId order byte-identical.
    #[test]
    fn gitlab_mapping_iteration_is_deterministic_across_runs() {
        // Names chosen to spread across hash buckets.
        let yaml = r#"
zeta-job:
  variables:
    ZZ_TOKEN: "$CI_TOKEN"
    AA_PASSWORD: "x"
    MM_SECRET: "y"
  script:
    - echo zeta
alpha-job:
  variables:
    QQ_TOKEN: "$CI_TOKEN"
    BB_API_KEY: "z"
  script:
    - echo alpha
mid-job:
  variables:
    NN_PRIVATE_KEY: "k"
    GG_SIGNING_KEY: "j"
  script:
    - echo mid
"#;
        // One fresh parse per call; the (kind, name) sequence is the
        // observable node order being compared.
        let snapshot = || -> Vec<(NodeKind, String)> {
            parse(yaml)
                .nodes
                .iter()
                .map(|n| (n.kind, n.name.clone()))
                .collect()
        };
        let canonical = snapshot();
        for run in 0..9 {
            let again = snapshot();
            assert_eq!(
                again, canonical,
                "run {run}: NodeId order must be byte-identical across runs"
            );
        }
    }

    #[test]
    fn mapping_jobs_without_recognisable_step_content_marks_partial() {
        // The GitLab parser adds a Step node for any mapping-valued
        // non-reserved key, so the only way to get "mapping-valued candidate
        // but zero steps" is a file whose sole candidate is a hidden/template
        // job (name starts with '.').
        let yaml = r#"
.template-only:
  script:
    - echo "this is a template-only file"
"#;
        let graph = parse(yaml);
        assert_eq!(step_count(&graph), 0);
        // Hidden jobs already mark partial with their own reason.
        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
        // The hidden `.template-only` job is a Structural gap. The zero-steps
        // fall-through does NOT fire here because `had_job_carrier` only
        // counts non-dot-prefixed mapping-valued top-level keys.
        assert_eq!(
            graph.completeness_gap_kinds.first(),
            Some(&GapKind::Structural)
        );
    }
}