Skip to main content

taudit_parse_gitlab/
lib.rs

1use std::collections::{HashMap, HashSet};
2
3use serde::{Deserialize, Serialize};
4use serde_yaml::Value;
5use taudit_core::error::TauditError;
6use taudit_core::graph::*;
7// Re-import explicitly to make the new constants visible at a glance.
8#[allow(unused_imports)]
9use taudit_core::graph::{META_DOTENV_FILE, META_ENVIRONMENT_NAME, META_NEEDS, META_SCRIPT_BODY};
10use taudit_core::ports::PipelineParser;
11
/// Parser that turns a GitLab CI configuration (`.gitlab-ci.yml`) into an
/// `AuthorityGraph`.
///
/// How pipeline constructs map onto the authority model:
/// - every job becomes a `Step` node;
/// - `CI_JOB_TOKEN` is one implicit, broad-scope `Identity` that is always
///   present;
/// - `secrets:` entries become `Secret` nodes reached via `HasAccessTo` edges;
/// - `id_tokens:` entries become OIDC `Identity` nodes;
/// - `variables:` entries whose names look like credentials become `Secret`
///   nodes;
/// - `image:` and `services:` become `Image` nodes reached via `UsesImage`
///   edges;
/// - `include:` and `extends:` downgrade the graph to `Partial`;
/// - `rules: if: merge_request_event` and `only: merge_requests` set
///   `META_TRIGGER`.
pub struct GitlabParser;
24
/// Top-level keys that configure the pipeline itself; a key listed here is
/// never interpreted as a job definition.
const RESERVED: &[&str] = &[
    // Pipeline structure.
    "stages",
    "workflow",
    "include",
    // Global job settings.
    "variables",
    "image",
    "services",
    "default",
    "cache",
    "before_script",
    "after_script",
    // Also reserved.
    "types",
];
39
/// Substrings that, when found in a variable name, mark the variable as a
/// credential instead of ordinary configuration.
const CRED_FRAGMENTS: &[&str] = &[
    // Generic secrets.
    "TOKEN",
    "SECRET",
    "PASSWORD",
    "PASSWD",
    // Key material.
    "PRIVATE_KEY",
    "API_KEY",
    "APIKEY",
    "SIGNING_KEY",
    "ACCESS_KEY",
    // Accounts and certificates.
    "SERVICE_ACCOUNT",
    "CERT",
    "CREDENTIAL",
];
55
56impl PipelineParser for GitlabParser {
57    fn platform(&self) -> &str {
58        "gitlab-ci"
59    }
60
61    fn parse(&self, content: &str, source: &PipelineSource) -> Result<AuthorityGraph, TauditError> {
62        let (parse_content, duplicate_recovery_note) = match parse_gitlab_yaml_value(content) {
63            Ok((root, extra_docs, first_doc_was_spec_header)) => {
64                let mut graph = build_graph_from_root(root, source)?;
65                if extra_docs {
66                    graph.mark_partial(
67                        GapKind::Expression,
68                        if first_doc_was_spec_header {
69                            "file contains GitLab spec: header plus executable config document — analyzed the executable document and preserved spec: as an unresolved header".to_string()
70                        } else {
71                            "file contains multiple YAML documents (--- separator) — only the first was analyzed".to_string()
72                        },
73                    );
74                }
75                return Ok(graph);
76            }
77            Err(e) if is_duplicate_key_parse_error(&e) => {
78                let sanitized = sanitize_duplicate_mapping_keys(content);
79                let note = format!(
80                    "GitLab YAML contained duplicate mapping keys; later duplicates were preserved as opaque __taudit_duplicate_* keys during recovery ({e})"
81                );
82                (sanitized, Some(note))
83            }
84            Err(e) => return Err(TauditError::Parse(format!("YAML parse error: {e}"))),
85        };
86
87        let (root, extra_docs, first_doc_was_spec_header) = parse_gitlab_yaml_value(&parse_content)
88            .map_err(|e| TauditError::Parse(format!("YAML parse error: {e}")))?;
89        let mut graph = build_graph_from_root(root, source)?;
90        if extra_docs {
91            graph.mark_partial(
92                GapKind::Expression,
93                if first_doc_was_spec_header {
94                    "file contains GitLab spec: header plus executable config document — analyzed the executable document and preserved spec: as an unresolved header".to_string()
95                } else {
96                    "file contains multiple YAML documents (--- separator) — only the first was analyzed".to_string()
97                },
98            );
99        }
100        if let Some(note) = duplicate_recovery_note {
101            graph.mark_partial(GapKind::Structural, note);
102        }
103        Ok(graph)
104    }
105}
106
107fn parse_gitlab_yaml_value(content: &str) -> Result<(Value, bool, bool), serde_yaml::Error> {
108    let mut de = serde_yaml::Deserializer::from_str(content);
109    let Some(doc) = de.next() else {
110        return Ok((Value::Null, false, false));
111    };
112    let first = Value::deserialize(doc)?;
113    let Some(second_doc) = de.next() else {
114        return Ok((first, false, false));
115    };
116    if gitlab_doc_is_spec_header(&first) {
117        return Ok((Value::deserialize(second_doc)?, true, true));
118    }
119    Ok((first, true, false))
120}
121
122fn gitlab_doc_is_spec_header(doc: &Value) -> bool {
123    let Some(map) = doc.as_mapping() else {
124        return false;
125    };
126    map.contains_key("spec")
127}
128
129fn is_duplicate_key_parse_error(error: &serde_yaml::Error) -> bool {
130    error.to_string().contains("duplicate entry with key")
131}
132
/// Rewrite `content` so that repeated keys inside one block mapping no longer
/// collide: the first occurrence of a key is kept, every later occurrence is
/// renamed to `__taudit_duplicate_<fragment>_<n>` (n starts at 2 and is
/// counted per key and indent across the whole text).
///
/// The scan is line based and indentation driven:
/// - blank lines, comment lines and the bodies of block scalars (`key: |`,
///   `key: >`, `- |`, `- >`) are copied through untouched;
/// - a `- key:` line starts a new mapping inside a sequence, so the set of
///   keys seen at that indent is reset — sibling items such as consecutive
///   `- if:` rules are not duplicates of each other;
/// - a trailing newline is kept if the input had one; line endings are
///   emitted as `\n`.
fn sanitize_duplicate_mapping_keys(content: &str) -> String {
    /// Keys already seen in the mapping whose entries sit at `indent`.
    struct Frame {
        indent: usize,
        keys: HashSet<String>,
    }

    let mut out: Vec<String> = Vec::new();
    // Stack of open mappings, strictly increasing by indent.
    let mut frames: Vec<Frame> = Vec::new();
    let mut duplicate_counts: HashMap<(usize, String), usize> = HashMap::new();
    // Indent of the line that opened the block scalar currently being skipped.
    let mut block_scalar_indent: Option<usize> = None;

    for line in content.lines() {
        let indent = line.chars().take_while(|c| *c == ' ').count();
        let trimmed = &line[indent..];

        if let Some(block_indent) = block_scalar_indent {
            if !trimmed.is_empty() && indent <= block_indent {
                // Dedent: the block scalar is over, handle this line normally.
                block_scalar_indent = None;
            } else {
                out.push(line.to_string());
                continue;
            }
        }

        if trimmed.is_empty() || trimmed.starts_with('#') {
            out.push(line.to_string());
            continue;
        }

        let Some((key_indent, key_start, key_end, key)) = yaml_mapping_key_span(line, indent)
        else {
            // A sequence item that is itself a block scalar (typical for
            // multi-line scripts): its body is free text and must not be
            // scanned for mapping keys.
            if trimmed.starts_with("- |") || trimmed.starts_with("- >") {
                block_scalar_indent = Some(indent);
            }
            out.push(line.to_string());
            continue;
        };

        // `yaml_mapping_key_span` moves the key indent past a leading `- `,
        // so a differing indent means this line opens a new sequence item.
        let opens_sequence_item = key_indent != indent;

        while frames.last().is_some_and(|frame| frame.indent > key_indent) {
            frames.pop();
        }
        match frames.last_mut() {
            Some(top) if top.indent == key_indent => {
                if opens_sequence_item {
                    // New item, new mapping: forget the previous item's keys.
                    top.keys.clear();
                }
            }
            _ => frames.push(Frame {
                indent: key_indent,
                keys: HashSet::new(),
            }),
        }
        // The match above guarantees the top frame belongs to `key_indent`.
        let first_occurrence = frames
            .last_mut()
            .is_some_and(|frame| frame.keys.insert(key.clone()));

        if first_occurrence {
            out.push(line.to_string());
        } else {
            let count = duplicate_counts
                .entry((key_indent, key.clone()))
                .and_modify(|n| *n += 1)
                .or_insert(2);
            let replacement = format!(
                "__taudit_duplicate_{}_{}",
                sanitize_key_fragment(&key),
                count
            );
            let mut rewritten = String::with_capacity(line.len() + replacement.len());
            rewritten.push_str(&line[..key_start]);
            rewritten.push_str(&replacement);
            rewritten.push_str(&line[key_end..]);
            out.push(rewritten);
        }

        let value_tail = line[key_end..].trim_start();
        if value_tail.starts_with(": |") || value_tail.starts_with(": >") {
            block_scalar_indent = Some(key_indent);
        }
    }

    let mut sanitized = out.join("\n");
    if content.ends_with('\n') {
        sanitized.push('\n');
    }
    sanitized
}

/// Locate the mapping key on a single YAML line.
///
/// `indent` is the number of leading spaces of `line`. Returns
/// `(key_indent, key_start, key_end, key)` where `key_start..key_end` is the
/// byte range of the trimmed key inside `line`, and `key_indent` is `indent`,
/// or `indent + 2` when the line starts with `- `. Returns `None` for comment
/// lines and for lines without a key separator.
///
/// A `:` only counts as the separator when it is outside quotes and
/// `[...]`/`{...}` brackets and is followed by whitespace or the end of the
/// line, so `http://host` is not split.
fn yaml_mapping_key_span(line: &str, indent: usize) -> Option<(usize, usize, usize, String)> {
    let trimmed = &line[indent..];
    if trimmed.starts_with('#') {
        return None;
    }

    // For `- key: value` the key text starts after the two-byte dash prefix.
    let (key_text, key_indent) = match trimmed.strip_prefix("- ") {
        Some(rest) => (rest, indent + 2),
        None => (trimmed, indent),
    };

    let mut in_single = false;
    let mut in_double = false;
    let mut bracket_depth = 0i32;
    let mut prev = '\0';
    for (offset, ch) in key_text.char_indices() {
        match ch {
            '\'' if !in_double => in_single = !in_single,
            '"' if !in_single && prev != '\\' => in_double = !in_double,
            '[' | '{' if !in_single && !in_double => bracket_depth += 1,
            ']' | '}' if !in_single && !in_double => bracket_depth -= 1,
            ':' if !in_single && !in_double && bracket_depth == 0 => {
                let after = key_text[offset + ch.len_utf8()..].chars().next();
                let is_separator = !after.is_some_and(|c| !c.is_whitespace());
                if is_separator {
                    let raw = &key_text[..offset];
                    let key = raw.trim();
                    if key.is_empty() {
                        return None;
                    }
                    let leading = raw.len() - raw.trim_start().len();
                    let start = key_indent + leading;
                    let end = key_indent + raw.trim_end().len();
                    return Some((key_indent, start, end, key.to_string()));
                }
            }
            _ => {}
        }
        prev = ch;
    }
    None
}

/// Reduce `key` to a fragment usable inside a replacement key: ASCII
/// alphanumerics are lower-cased, every run of other characters becomes a
/// single `_`, leading/trailing `_` are removed and the result is cut to 48
/// characters.
fn sanitize_key_fragment(key: &str) -> String {
    let mut collapsed = String::with_capacity(key.len());
    for c in key.chars() {
        if c.is_ascii_alphanumeric() {
            collapsed.push(c.to_ascii_lowercase());
        } else if !collapsed.ends_with('_') {
            // Collapse runs of separators while building the string.
            collapsed.push('_');
        }
    }
    collapsed.trim_matches('_').chars().take(48).collect()
}
282
283fn build_graph_from_root(
284    root: Value,
285    source: &PipelineSource,
286) -> Result<AuthorityGraph, TauditError> {
287    let mapping = root
288        .as_mapping()
289        .ok_or_else(|| TauditError::Parse("GitLab CI root must be a mapping".into()))?;
290
291    let mut graph = AuthorityGraph::new(source.clone());
292    graph.metadata.insert(META_PLATFORM.into(), "gitlab".into());
293
294    // CI_JOB_TOKEN is always present in every GitLab CI job — it's the built-in
295    // platform token, equivalent to ADO's System.AccessToken or GHA's GITHUB_TOKEN.
296    let mut meta = HashMap::new();
297    meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
298    meta.insert(META_IMPLICIT.into(), "true".into());
299    let token_id = graph.add_node_with_metadata(
300        NodeKind::Identity,
301        "CI_JOB_TOKEN",
302        TrustZone::FirstParty,
303        meta,
304    );
305
306    // Top-level include: → mark Partial immediately AND capture each
307    // entry's structure as graph metadata so include-pinning rules can
308    // reason about remote URLs and unpinned project refs.
309    if let Some(inc) = mapping.get("include") {
310        graph.mark_partial(
311            GapKind::Structural,
312            "include: directive present — included templates not resolved".to_string(),
313        );
314        let entries = extract_include_entries(inc);
315        if !entries.is_empty() {
316            if let Ok(json) = serde_json::to_string(&entries) {
317                graph.metadata.insert(META_GITLAB_INCLUDES.into(), json);
318            }
319        }
320    }
321
322    // Top-level default: can inject authority-relevant settings into every
323    // job (image/services/variables/secrets/id_tokens/scripts/cache/artifacts).
324    // We currently do not materialize that inheritance chain, so mark Partial
325    // to avoid false completeness.
326    if let Some(default_map) = mapping.get("default").and_then(|v| v.as_mapping()) {
327        if default_contains_authority_relevant_keys(default_map) {
328            graph.mark_partial(
329                GapKind::Structural,
330                "default: contains inherited authority-relevant job settings — inheritance not fully resolved".to_string(),
331            );
332        }
333    }
334
335    // Global variables
336    let global_secrets = process_variables(mapping.get("variables"), &mut graph, "pipeline");
337
338    // Global image
339    let global_image = mapping.get("image").and_then(extract_image_str);
340
341    // Top-level merge_request trigger detection from `workflow:` rules
342    if let Some(wf) = mapping.get("workflow") {
343        if has_mr_trigger_in_workflow(wf) {
344            graph
345                .metadata
346                .insert(META_TRIGGER.into(), "merge_request".into());
347        }
348        if workflow_rules_define_variables(wf) {
349            graph.mark_partial(
350                GapKind::Expression,
351                "workflow:rules:variables define conditional variables — rule expressions not evaluated".to_string(),
352            );
353        }
354    }
355
356    // Process each job (any top-level key not in RESERVED)
357    // determinism: sort by key — same YAML must produce same NodeId order
358    let mut top_level_entries: Vec<(&Value, &Value)> = mapping.iter().collect();
359    top_level_entries.sort_by(|a, b| a.0.as_str().unwrap_or("").cmp(b.0.as_str().unwrap_or("")));
360    for (key, value) in top_level_entries {
361        let job_name = match key.as_str() {
362            Some(k) => k,
363            None => continue,
364        };
365        if RESERVED.contains(&job_name) {
366            continue;
367        }
368
369        // Hidden jobs (starting with a dot) are templates — mark Partial, skip
370        if job_name.starts_with('.') {
371            graph.mark_partial(
372                GapKind::Structural,
373                format!("job '{job_name}' is a hidden/template job — not resolved"),
374            );
375            continue;
376        }
377
378        let job_map = match value.as_mapping() {
379            Some(m) => m,
380            None => continue,
381        };
382
383        // extends: — job template inheritance, can't resolve statically
384        let extends_names = extract_extends_list(job_map.get("extends"));
385        if !extends_names.is_empty() {
386            graph.mark_partial(
387                GapKind::Structural,
388                format!("job '{job_name}' uses extends: — inherited configuration not resolved"),
389            );
390        }
391
392        if rules_define_variables(job_map.get("rules")) {
393            graph.mark_partial(
394                GapKind::Expression,
395                format!(
396                    "job '{job_name}' uses rules:variables — conditional variable scope not resolved"
397                ),
398            );
399        }
400
401        // inherit: controls whether job receives top-level `default:` and
402        // `variables:`. We don't model the inheritance matrix yet.
403        if job_map.contains_key("inherit") {
404            graph.mark_partial(
405                GapKind::Structural,
406                format!("job '{job_name}' uses inherit: — inheritance scope not resolved"),
407            );
408        }
409
410        // Detect PR/MR trigger in this job's rules: or only:
411        let job_triggers_mr = job_has_mr_trigger(job_map);
412
413        // Propagate job MR trigger to graph level
414        if job_triggers_mr && !graph.metadata.contains_key(META_TRIGGER) {
415            graph
416                .metadata
417                .insert(META_TRIGGER.into(), "merge_request".into());
418        }
419
420        // Job-level variables
421        let job_secrets = process_variables(job_map.get("variables"), &mut graph, job_name);
422
423        // Job-level explicit secrets: (Vault, AWS Secrets Manager, GCP, Azure)
424        let explicit_secrets =
425            process_explicit_secrets(job_map.get("secrets"), job_name, &mut graph);
426
427        // Job-level OIDC tokens (id_tokens:)
428        let oidc_identities = process_id_tokens(job_map.get("id_tokens"), job_name, &mut graph);
429
430        // Job image (falls back to global)
431        let job_image_str = job_map
432            .get("image")
433            .and_then(extract_image_str)
434            .or(global_image.as_deref().map(String::from));
435
436        let image_id = job_image_str.as_deref().map(|img| {
437            let pinned = is_docker_digest_pinned(img);
438            let trust_zone = if pinned {
439                TrustZone::ThirdParty
440            } else {
441                TrustZone::Untrusted
442            };
443            let mut imeta = HashMap::new();
444            if let Some(digest) = img.split("@sha256:").nth(1) {
445                imeta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
446            }
447            graph.add_node_with_metadata(NodeKind::Image, img, trust_zone, imeta)
448        });
449
450        // Services (each is an Image node)
451        let service_ids = process_services(job_map.get("services"), &mut graph);
452
453        // Environment — record name as metadata, sets trust boundary marker
454        let env_name = job_map
455            .get("environment")
456            .and_then(extract_environment_name);
457        let env_url = job_map.get("environment").and_then(extract_environment_url);
458
459        // Concatenated script body (before_script + script + after_script).
460        // Stamped on the Step node so script-aware rules (notably
461        // `untrusted_ci_var_in_shell_interpolation` and
462        // `ci_job_token_to_external_api`) can pattern-match without
463        // re-walking the YAML.
464        // Inline script body — concatenate before_script, script, after_script
465        // (each may be a string or a list-of-strings). Stamped on the Step so
466        // script-aware rules can pattern-match without re-parsing YAML.
467        let script_body = extract_script_body(job_map);
468
469        // GitLab `artifacts.reports.dotenv: <file>` — when set, the file's
470        // KEY=value lines are silently promoted to pipeline variables for
471        // any downstream job that consumes this one via `needs:` /
472        // `dependencies:`. Required input to
473        // `dotenv_artifact_flows_to_privileged_deployment`.
474        let dotenv_file = extract_dotenv_file(job_map);
475
476        // Upstream job names consumed via `needs:` / `dependencies:`.
477        // Used to build dotenv-flow chains across stages.
478        let needs = extract_needs(job_map);
479
480        // Detect whether this job's `rules:` / `only:` clause restricts
481        // execution to protected branches (or to the default branch,
482        // which is protected by GitLab default policy). Used by the
483        // `gitlab_deploy_job_missing_protected_branch_only` rule to
484        // detect deployment jobs that lack any branch guard.
485        let protected_only = job_has_protected_branch_restriction(job_map);
486
487        // Create the Step node for this job
488        let mut step_meta = HashMap::new();
489        step_meta.insert(META_JOB_NAME.into(), job_name.to_string());
490        if let Some(ref env) = env_name {
491            step_meta.insert(META_ENVIRONMENT_NAME.into(), env.clone());
492        }
493        if !script_body.is_empty() {
494            step_meta.insert(META_SCRIPT_BODY.into(), script_body);
495        }
496        if let Some(ref f) = dotenv_file {
497            step_meta.insert(META_DOTENV_FILE.into(), f.clone());
498        }
499        if !needs.is_empty() {
500            step_meta.insert(META_NEEDS.into(), needs.join(","));
501        }
502        if let Some(ref url) = env_url {
503            step_meta.insert(META_ENVIRONMENT_URL.into(), url.clone());
504        }
505        // Per-step MR trigger marker — graph-level META_TRIGGER applies to
506        // the file as a whole, but `id_token_audience_overscoped` needs to
507        // compare audience usage between MR-context and protected-context
508        // jobs in the same file.
509        if job_triggers_mr {
510            step_meta.insert(META_TRIGGER.into(), "merge_request".into());
511        }
512        // extends: list (comma-joined, in source order)
513        if !extends_names.is_empty() {
514            step_meta.insert(META_GITLAB_EXTENDS.into(), extends_names.join(","));
515        }
516        // allow_failure: true|false (only stamp when explicitly set so the
517        // rule can distinguish "absent" from "false")
518        if let Some(af) = job_map.get("allow_failure").and_then(|v| v.as_bool()) {
519            step_meta.insert(META_GITLAB_ALLOW_FAILURE.into(), af.to_string());
520        } else if job_map
521            .get("allow_failure")
522            .and_then(|v| v.as_mapping())
523            .is_some()
524        {
525            // `allow_failure: { exit_codes: [42] }` — conditional pass; treat
526            // as truthy for silent-skip detection.
527            step_meta.insert(META_GITLAB_ALLOW_FAILURE.into(), "true".into());
528        }
529        // dind sidecar detection: any service whose name matches docker:*-dind
530        if job_services_have_dind(job_map.get("services")) {
531            step_meta.insert(META_GITLAB_DIND_SERVICE.into(), "true".into());
532        }
533        // trigger: block — child / downstream pipeline
534        if let Some(kind) = classify_trigger(job_map.get("trigger")) {
535            step_meta.insert(META_GITLAB_TRIGGER_KIND.into(), kind.into());
536        }
537        // cache: structural capture (key + policy)
538        if let Some((cache_key, cache_policy)) = extract_cache_key_policy(job_map.get("cache")) {
539            step_meta.insert(META_GITLAB_CACHE_KEY.into(), cache_key);
540            if let Some(p) = cache_policy {
541                step_meta.insert(META_GITLAB_CACHE_POLICY.into(), p);
542            }
543        }
544        if protected_only {
545            step_meta.insert(META_RULES_PROTECTED_ONLY.into(), "true".into());
546        }
547        let step_id = graph.add_node_with_metadata(
548            NodeKind::Step,
549            job_name,
550            TrustZone::FirstParty,
551            step_meta,
552        );
553
554        // CI_JOB_TOKEN always available to every step
555        graph.add_edge(step_id, token_id, EdgeKind::HasAccessTo);
556
557        // Link all secrets
558        for &sid in global_secrets
559            .iter()
560            .chain(&job_secrets)
561            .chain(&explicit_secrets)
562        {
563            graph.add_edge(step_id, sid, EdgeKind::HasAccessTo);
564        }
565
566        // Link OIDC identities
567        for &iid in &oidc_identities {
568            graph.add_edge(step_id, iid, EdgeKind::HasAccessTo);
569        }
570
571        // UsesImage edges
572        if let Some(img_id) = image_id {
573            graph.add_edge(step_id, img_id, EdgeKind::UsesImage);
574        }
575        for &svc_id in &service_ids {
576            graph.add_edge(step_id, svc_id, EdgeKind::UsesImage);
577        }
578    }
579
580    // Cross-platform misclassification trap (red-team R2 #5): a YAML file
581    // with non-reserved top-level keys looks like a GitLab pipeline shape
582    // but its body may use constructs the GitLab parser doesn't recognise
583    // (e.g. an ADO `task:` payload). Mark Partial when the source had at
584    // least one job-shaped top-level key but we ended up with no Step
585    // nodes — better than silently returning completeness=complete on a
586    // clean-but-empty graph that a CI gate would treat as "passed".
587    let step_count = graph
588        .nodes
589        .iter()
590        .filter(|n| n.kind == NodeKind::Step)
591        .count();
592    let had_job_carrier = mapping.iter().any(|(k, v)| {
593        k.as_str()
594            .map(|name| !RESERVED.contains(&name) && !name.starts_with('.'))
595            .unwrap_or(false)
596            && v.as_mapping().is_some()
597    });
598    if step_count == 0 && had_job_carrier {
599        graph.mark_partial(
600                GapKind::Opaque,
601                "non-reserved top-level keys parsed but produced 0 step nodes — possible non-GitLab YAML wrong-platform-classified".to_string(),
602            );
603    }
604
605    graph.stamp_edge_authority_summaries();
606    Ok(graph)
607}
608/// Detect `image:` string from a YAML value — can be a bare string or a mapping with `name:`.
609fn extract_image_str(v: &Value) -> Option<String> {
610    match v {
611        Value::String(s) => Some(s.clone()),
612        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
613        _ => None,
614    }
615}
616
617/// Extract environment name from `environment:` value (string or mapping).
618fn extract_environment_name(v: &Value) -> Option<String> {
619    match v {
620        Value::String(s) => Some(s.clone()),
621        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
622        _ => None,
623    }
624}
625
626/// Extract `environment:url:` value (only present when environment is a mapping).
627fn extract_environment_url(v: &Value) -> Option<String> {
628    match v {
629        Value::Mapping(m) => m.get("url").and_then(|u| u.as_str()).map(String::from),
630        _ => None,
631    }
632}
633
634/// Concatenate `before_script`, `script`, and `after_script` of a job into one
635/// string body (separated by newlines). Each section may be a single string or
636/// a list of strings. Empty sections are skipped.
637fn extract_script_body(job_map: &serde_yaml::Mapping) -> String {
638    let mut lines: Vec<String> = Vec::new();
639    for key in &["before_script", "script", "after_script"] {
640        if let Some(v) = job_map.get(*key) {
641            collect_script_lines(v, &mut lines);
642        }
643    }
644    lines.join("\n")
645}
646
647/// Append script lines from a YAML value (string or sequence of strings).
648fn collect_script_lines(v: &Value, out: &mut Vec<String>) {
649    match v {
650        Value::String(s) => out.push(s.clone()),
651        Value::Sequence(seq) => {
652            for item in seq {
653                if let Some(s) = item.as_str() {
654                    out.push(s.to_string());
655                }
656            }
657        }
658        _ => {}
659    }
660}
661
662/// Extract `artifacts.reports.dotenv` filename. Value may be a single string
663/// or a list of strings — for the list form we join with `,`.
664fn extract_dotenv_file(job_map: &serde_yaml::Mapping) -> Option<String> {
665    let dotenv = job_map
666        .get("artifacts")?
667        .as_mapping()?
668        .get("reports")?
669        .as_mapping()?
670        .get("dotenv")?;
671    match dotenv {
672        Value::String(s) => Some(s.clone()),
673        Value::Sequence(seq) => {
674            let parts: Vec<String> = seq
675                .iter()
676                .filter_map(|v| v.as_str().map(String::from))
677                .collect();
678            if parts.is_empty() {
679                None
680            } else {
681                Some(parts.join(","))
682            }
683        }
684        _ => None,
685    }
686}
687
688/// Extract upstream job names from `needs:` and `dependencies:`.
689/// `needs:` may be a list of strings or a list of mappings with `job:`.
690/// `dependencies:` is a list of strings.
691///
692/// F5: GitLab `needs:` entries support an `artifacts: false` opt-out that
693/// stops the upstream's artifacts (including its `dotenv` report) from
694/// flowing into this job. Excluding those entries here means the comma-joined
695/// `META_NEEDS` consumed by `dotenv_artifact_flows_to_privileged_deployment`
696/// only contains jobs whose artifacts genuinely flow — no rule-side change
697/// needed.
698fn extract_needs(job_map: &serde_yaml::Mapping) -> Vec<String> {
699    let mut out: Vec<String> = Vec::new();
700    if let Some(needs) = job_map.get("needs").and_then(|v| v.as_sequence()) {
701        for item in needs {
702            match item {
703                Value::String(s) => out.push(s.clone()),
704                Value::Mapping(m) => {
705                    let Some(j) = m.get("job").and_then(|j| j.as_str()) else {
706                        continue;
707                    };
708                    // `artifacts:` defaults to true when omitted. Only skip
709                    // when explicitly set to false — anything else (true,
710                    // missing, weird shape) keeps the dependency.
711                    let artifacts_disabled =
712                        m.get("artifacts").and_then(|v| v.as_bool()) == Some(false);
713                    if artifacts_disabled {
714                        continue;
715                    }
716                    out.push(j.to_string());
717                }
718                _ => {}
719            }
720        }
721    }
722    if let Some(deps) = job_map.get("dependencies").and_then(|v| v.as_sequence()) {
723        for item in deps {
724            if let Some(s) = item.as_str() {
725                out.push(s.to_string());
726            }
727        }
728    }
729    out.sort();
730    out.dedup();
731    out
732}
733
734/// Recognise the canonical "is `var` truthy?" shape inside a GitLab CI
735/// `rules: if:` expression. Returns:
736///
737/// * `Some(true)` — the expression positively asserts `var` is truthy
738///   (e.g. `$VAR == "true"`, `$VAR == true`, bare `$VAR`, or any of those
739///   joined to other clauses with `&&`).
740/// * `Some(false)` — the expression negates `var`'s truthiness
741///   (e.g. `$VAR != "true"`, `$VAR == "false"`, `$VAR == null`).
742/// * `None` — the shape isn't recognisable; caller MUST treat as "no positive
743///   signal" (i.e. do not stamp protected-only or merge_request_event metadata).
744///
745/// We deliberately keep this minimal — better to under-claim protection than
746/// over-claim it. Anything we don't understand returns `None`.
747///
748/// Boundary discipline: `var` matches only when it appears as a `$VAR` token
749/// surrounded by non-identifier chars (or string ends), so `$CI_COMMIT_TAG`
750/// does not silently match `$CI_COMMIT_TAG_MESSAGE`.
751fn check_truthy_comparison(expr: &str, var: &str) -> Option<bool> {
752    // Split on `||` first — if ANY top-level disjunct is positive, the
753    // whole expression is positive (any one matching clause makes the rule
754    // fire). For `&&`, all conjuncts must agree; if any conjunct contradicts
755    // the others, we fall back to None.
756    let trimmed = expr.trim();
757    if trimmed.is_empty() {
758        return None;
759    }
760
761    // Top-level `||` short-circuit: if any disjunct is positive, accept.
762    if let Some((lhs, rhs)) = split_top_level(trimmed, "||") {
763        let l = check_truthy_comparison(&lhs, var);
764        let r = check_truthy_comparison(&rhs, var);
765        return match (l, r) {
766            (Some(true), _) | (_, Some(true)) => Some(true),
767            (Some(false), Some(false)) => Some(false),
768            _ => None,
769        };
770    }
771    // Top-level `&&`: positive only if at least one conjunct is positive
772    // and none is explicitly negative. (A conjunct that doesn't mention
773    // `var` is None — neutral — so we treat it as non-blocking.)
774    if let Some((lhs, rhs)) = split_top_level(trimmed, "&&") {
775        let l = check_truthy_comparison(&lhs, var);
776        let r = check_truthy_comparison(&rhs, var);
777        return match (l, r) {
778            (Some(false), _) | (_, Some(false)) => Some(false),
779            (Some(true), _) | (_, Some(true)) => Some(true),
780            _ => None,
781        };
782    }
783
784    // No top-level boolean op — atomic comparison or bare reference.
785    classify_atom(trimmed, var)
786}
787
/// Cut `expr` in two at the first occurrence of `op` that sits at the top
/// level, i.e. at parenthesis depth zero and outside any string or regex
/// literal. The returned halves exclude `op` itself and are not trimmed.
///
/// Returns `None` when `op` never occurs at the top level. An unmatched `)`
/// drives the depth negative, after which nothing counts as top level.
///
/// Literal handling:
/// * `"` or `'` opens a string that runs to the next matching quote.
/// * `/` opens a regex only when the previous non-whitespace byte is `~`
///   (the tail of `=~` / `!~`); it runs to the next `/`.
/// * Inside either literal a backslash skips the following byte.
fn split_top_level(expr: &str, op: &str) -> Option<(String, String)> {
    /// What the scanner is currently inside of.
    enum Mode {
        Code,
        Quoted(u8),
        Regex,
    }

    let src = expr.as_bytes();
    let needle = op.as_bytes();
    let mut mode = Mode::Code;
    let mut nesting: i32 = 0;
    let mut pos = 0;

    while pos < src.len() {
        let ch = src[pos];

        // Inside a literal: only look for the escape byte and the terminator.
        let terminator = match mode {
            Mode::Code => None,
            Mode::Quoted(quote) => Some(quote),
            Mode::Regex => Some(b'/'),
        };
        if let Some(terminator) = terminator {
            if ch == b'\\' && pos + 1 < src.len() {
                pos += 2;
                continue;
            }
            if ch == terminator {
                mode = Mode::Code;
            }
            pos += 1;
            continue;
        }

        match ch {
            b'"' | b'\'' => {
                mode = Mode::Quoted(ch);
                pos += 1;
                continue;
            }
            b'/' => {
                // Regex literal only if the last non-blank byte before is `~`.
                let previous = src[..pos]
                    .iter()
                    .rev()
                    .find(|c| !c.is_ascii_whitespace());
                if previous == Some(&b'~') {
                    mode = Mode::Regex;
                    pos += 1;
                    continue;
                }
            }
            b'(' => nesting += 1,
            b')' => nesting -= 1,
            _ => {}
        }

        if nesting == 0 && src[pos..].starts_with(needle) {
            let left = expr[..pos].to_string();
            let right = expr[pos + needle.len()..].to_string();
            return Some((left, right));
        }
        pos += 1;
    }
    None
}
858
/// Classify one boolean-operator-free sub-expression against `var`.
///
/// Recognised shapes (after trimming whitespace and outer parentheses):
/// * the atom is exactly `var` → `Some(true)` (bare reference);
/// * `var == LIT` / `var != LIT` with the variable on either side,
///   optionally wrapped in quotes.
///
/// `LIT` is unquoted and lower-cased: `true` / `1` count as truthy,
/// `false` / `null` / `0` / empty as falsy. `==` against a truthy literal
/// and `!=` against a falsy one give `Some(true)`; the two mirror cases give
/// `Some(false)`.
///
/// Anything else returns `None`: regex matches, an operand that is not
/// exactly `var`, or comparison with an arbitrary literal such as a branch
/// name.
fn classify_atom(atom: &str, var: &str) -> Option<bool> {
    /// Strip surrounding double quotes, then surrounding single quotes.
    fn unquote(s: &str) -> &str {
        s.trim_matches('"').trim_matches('\'')
    }

    let core = atom.trim().trim_matches('(').trim_matches(')').trim();
    if core == var {
        return Some(true);
    }

    // `==` is tried before `!=`; with neither present the atom is opaque.
    let (is_equality, left, right) = match core.split_once("==") {
        Some((l, r)) => (true, l.trim(), r.trim()),
        None => {
            let (l, r) = core.split_once("!=")?;
            (false, l.trim(), r.trim())
        }
    };

    // One operand must be the variable (bare, or quoted as a fallback);
    // the other operand is the literal it is compared against.
    let literal = if left == var {
        right
    } else if right == var {
        left
    } else if unquote(left) == var {
        right
    } else if unquote(right) == var {
        left
    } else {
        return None;
    };

    let literal = unquote(literal).to_ascii_lowercase();
    let literal_is_truthy = match literal.as_str() {
        "true" | "1" => true,
        "false" | "null" | "" | "0" => false,
        // Arbitrary literal (e.g. a branch name): not a truthiness test.
        _ => return None,
    };
    // `== truthy` and `!= falsy` assert the variable; the other two negate it.
    Some(literal_is_truthy == is_equality)
}
915
916/// Classify a variable name as a credential by checking for common fragments.
917///
918/// Each fragment in `CRED_FRAGMENTS` must appear as a *segment* of the name
919/// (bounded by `_` or by the start/end of the string), NOT as a free-floating
920/// substring. This avoids false positives like `CERTAIN_FLAG` (matches `CERT`
921/// substring), `CERTIFICATE_PATH` (path config, not a credential),
922/// `TOKENIZER_VERSION` (matches `TOKEN`), and `UNCERTAIN`.
923///
924/// A multi-token fragment like `PRIVATE_KEY` matches when its full text appears
925/// at a segment boundary on both sides — i.e. surrounded by `_` or string ends.
926fn is_credential_name(name: &str) -> bool {
927    let upper = name.to_uppercase();
928    let bytes = upper.as_bytes();
929    CRED_FRAGMENTS.iter().any(|frag| {
930        let frag_bytes = frag.as_bytes();
931        let n = frag_bytes.len();
932        if bytes.len() < n {
933            return false;
934        }
935        // Slide the fragment across the name, accepting only segment-bounded matches.
936        for i in 0..=bytes.len() - n {
937            if &bytes[i..i + n] != frag_bytes {
938                continue;
939            }
940            let left_ok = i == 0 || bytes[i - 1] == b'_';
941            let right_ok = i + n == bytes.len() || bytes[i + n] == b'_';
942            if left_ok && right_ok {
943                return true;
944            }
945        }
946        false
947    })
948}
949
950/// Parse `variables:` mapping and emit `Secret` nodes for credential-pattern names.
951/// Returns the list of created node IDs.
952fn process_variables(vars: Option<&Value>, graph: &mut AuthorityGraph, scope: &str) -> Vec<NodeId> {
953    let mut ids = Vec::new();
954    let map = match vars.and_then(|v| v.as_mapping()) {
955        Some(m) => m,
956        None => return ids,
957    };
958    // determinism: sort by key — same YAML must produce same NodeId order
959    let mut entries: Vec<(&Value, &Value)> = map.iter().collect();
960    entries.sort_by(|a, b| a.0.as_str().unwrap_or("").cmp(b.0.as_str().unwrap_or("")));
961    for (k, _v) in entries {
962        let name = match k.as_str() {
963            Some(s) => s,
964            None => continue,
965        };
966        if is_credential_name(name) {
967            let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
968            ids.push(id);
969            let _ = scope; // used for future scoped error messages
970        }
971    }
972    ids
973}
974
975/// Parse `secrets:` block and emit one `Secret` node per named secret.
976///
977/// GitLab CI `secrets:` format:
978/// ```yaml
979/// secrets:
980///   DATABASE_PASSWORD:
981///     vault: production/db/password@secret
982///   AWS_KEY:
983///     aws_secrets_manager:
984///       name: my-secret
985/// ```
986fn process_explicit_secrets(
987    secrets: Option<&Value>,
988    _scope: &str,
989    graph: &mut AuthorityGraph,
990) -> Vec<NodeId> {
991    let mut ids = Vec::new();
992    let map = match secrets.and_then(|v| v.as_mapping()) {
993        Some(m) => m,
994        None => return ids,
995    };
996    // determinism: sort by key — same YAML must produce same NodeId order
997    let mut entries: Vec<(&Value, &Value)> = map.iter().collect();
998    entries.sort_by(|a, b| a.0.as_str().unwrap_or("").cmp(b.0.as_str().unwrap_or("")));
999    for (k, _v) in entries {
1000        let name = match k.as_str() {
1001            Some(s) => s,
1002            None => continue,
1003        };
1004        let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
1005        ids.push(id);
1006    }
1007    ids
1008}
1009
1010/// Parse `id_tokens:` block and emit one OIDC `Identity` node per token.
1011///
1012/// GitLab CI `id_tokens:` format:
1013/// ```yaml
1014/// id_tokens:
1015///   SIGSTORE_ID_TOKEN:
1016///     aud: sigstore
1017///   AWS_OIDC_TOKEN:
1018///     aud: https://sts.amazonaws.com
1019/// ```
1020fn process_id_tokens(
1021    id_tokens: Option<&Value>,
1022    _scope: &str,
1023    graph: &mut AuthorityGraph,
1024) -> Vec<NodeId> {
1025    let mut ids = Vec::new();
1026    let map = match id_tokens.and_then(|v| v.as_mapping()) {
1027        Some(m) => m,
1028        None => return ids,
1029    };
1030    // determinism: sort by key — same YAML must produce same NodeId order
1031    let mut entries: Vec<(&Value, &Value)> = map.iter().collect();
1032    entries.sort_by(|a, b| a.0.as_str().unwrap_or("").cmp(b.0.as_str().unwrap_or("")));
1033    for (k, v) in entries {
1034        let token_name = match k.as_str() {
1035            Some(s) => s,
1036            None => continue,
1037        };
1038        // F3: GitLab supports list-form `aud: [a, b, c]` (multi-cloud broker —
1039        // strongest over-scoping signal). Previously `as_str()` on a sequence
1040        // returned None and we fell through to "unknown", silently blinding
1041        // every multi-aud rule. Handle both shapes explicitly.
1042        let aud_value = v.as_mapping().and_then(|m| m.get("aud"));
1043        let (aud_joined, is_list) = match aud_value {
1044            Some(Value::String(s)) => (s.clone(), false),
1045            Some(Value::Sequence(seq)) => {
1046                let parts: Vec<String> = seq
1047                    .iter()
1048                    .filter_map(|item| match item {
1049                        Value::String(s) => Some(s.clone()),
1050                        _ => None,
1051                    })
1052                    .collect();
1053                if parts.is_empty() {
1054                    ("unknown".into(), false)
1055                } else {
1056                    (parts.join(","), true)
1057                }
1058            }
1059            _ => ("unknown".into(), false),
1060        };
1061        let label = format!("{token_name} (aud={aud_joined})");
1062        let mut meta = HashMap::new();
1063        meta.insert(META_OIDC.into(), "true".into());
1064        meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
1065        // Backward-compat: keep the single-`aud` field populated. For the
1066        // list form it now holds the comma-joined string so existing
1067        // consumers see *something* rather than "unknown".
1068        meta.insert(META_OIDC_AUDIENCE.into(), aud_joined.clone());
1069        // New (F3): explicit "list form" marker. Only set on the multi-aud
1070        // path so downstream rules can distinguish single-aud vs multi-aud
1071        // configurations without parsing the comma-joined string.
1072        if is_list {
1073            meta.insert(META_OIDC_AUDIENCES.into(), aud_joined.clone());
1074        }
1075        let id =
1076            graph.add_node_with_metadata(NodeKind::Identity, label, TrustZone::FirstParty, meta);
1077        ids.push(id);
1078    }
1079    ids
1080}
1081
1082/// Parse `services:` block and emit `Image` nodes.
1083fn process_services(services: Option<&Value>, graph: &mut AuthorityGraph) -> Vec<NodeId> {
1084    let mut ids = Vec::new();
1085    let list = match services.and_then(|v| v.as_sequence()) {
1086        Some(s) => s,
1087        None => return ids,
1088    };
1089    for item in list {
1090        let img_str = match extract_image_str(item) {
1091            Some(s) => s,
1092            None => continue,
1093        };
1094        let pinned = is_docker_digest_pinned(&img_str);
1095        let trust_zone = if pinned {
1096            TrustZone::ThirdParty
1097        } else {
1098            TrustZone::Untrusted
1099        };
1100        let mut meta = HashMap::new();
1101        if let Some(digest) = img_str.split("@sha256:").nth(1) {
1102            meta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
1103        }
1104        let id = graph.add_node_with_metadata(NodeKind::Image, &img_str, trust_zone, meta);
1105        ids.push(id);
1106    }
1107    ids
1108}
1109
1110/// Check whether a job's `rules:` or `only:` indicates it runs on merge requests.
1111fn job_has_mr_trigger(job_map: &serde_yaml::Mapping) -> bool {
1112    // rules: [{if: '$CI_PIPELINE_SOURCE == "merge_request_event"'}]
1113    if let Some(rules) = job_map.get("rules").and_then(|v| v.as_sequence()) {
1114        for rule in rules {
1115            if let Some(if_expr) = rule
1116                .as_mapping()
1117                .and_then(|m| m.get("if"))
1118                .and_then(|v| v.as_str())
1119            {
1120                // F2: MR-trigger only fires for the *positive* equality form.
1121                // `$CI_PIPELINE_SOURCE != "merge_request_event"` ("run except
1122                // on MRs") used to set META_TRIGGER=merge_request and pollute
1123                // every downstream MR-context rule.
1124                if matches_mr_event(if_expr) {
1125                    return true;
1126                }
1127            }
1128        }
1129    }
1130    // only: [merge_requests] or only: {refs: [merge_requests]}
1131    if let Some(only) = job_map.get("only") {
1132        if only_has_merge_requests(only) {
1133            return true;
1134        }
1135    }
1136    false
1137}
1138
1139/// Check `only:` value (sequence or mapping) for `merge_requests` entry.
1140fn only_has_merge_requests(v: &Value) -> bool {
1141    match v {
1142        Value::Sequence(seq) => seq
1143            .iter()
1144            .any(|item| item.as_str() == Some("merge_requests")),
1145        Value::Mapping(m) => {
1146            if let Some(refs) = m.get("refs").and_then(|r| r.as_sequence()) {
1147                return refs
1148                    .iter()
1149                    .any(|item| item.as_str() == Some("merge_requests"));
1150            }
1151            false
1152        }
1153        _ => false,
1154    }
1155}
1156
1157/// Returns true when a job's `rules:` or `only:` clause restricts execution
1158/// to protected refs only. The set of accepted patterns is intentionally
1159/// generous because the goal is to *credit* defensive intent, not to
1160/// audit-grade verify that every protection actually exists in GitLab's
1161/// branch-protection settings — that lives outside the YAML.
1162///
1163/// Patterns recognised as a protected-only restriction:
1164///
1165///   * any `rules: [{ if: ... $CI_COMMIT_REF_PROTECTED ... }]`
1166///   * any `rules: [{ if: ... $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH ... }]`
1167///     (default branch is GitLab-protected by default)
1168///   * any `rules: [{ if: ... $CI_COMMIT_TAG ... }]` (tags are protected by default)
1169///   * `only: [main]` / `only: [master]` / `only: tags`
1170///   * `only: { refs: [main, /^release/.*/] }`
1171///
1172/// Hits any one of the above → true. Misses every one → false.
1173fn job_has_protected_branch_restriction(job_map: &serde_yaml::Mapping) -> bool {
1174    if let Some(rules) = job_map.get("rules").and_then(|v| v.as_sequence()) {
1175        for rule in rules {
1176            let Some(if_expr) = rule
1177                .as_mapping()
1178                .and_then(|m| m.get("if"))
1179                .and_then(|v| v.as_str())
1180            else {
1181                continue;
1182            };
1183            // F1: `$CI_COMMIT_REF_PROTECTED` — only a *positive* assertion
1184            // ("ref IS protected") counts. `== "false"` or `!= "true"` is the
1185            // exact opposite signal and must NOT stamp protected-only.
1186            if matches!(
1187                check_truthy_comparison(if_expr, "$CI_COMMIT_REF_PROTECTED"),
1188                Some(true)
1189            ) {
1190                return true;
1191            }
1192            if if_expr.contains("$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH")
1193                || if_expr.contains("$CI_DEFAULT_BRANCH == $CI_COMMIT_BRANCH")
1194            {
1195                return true;
1196            }
1197            // F1: `$CI_COMMIT_TAG` — only the truthy form ("running on a
1198            // tag", which GitLab protects by default). Reject negations
1199            // (`== null`, `!= ...`) and avoid the substring-collision with
1200            // `$CI_COMMIT_TAG_MESSAGE` that the previous `contains()` had.
1201            if matches!(
1202                check_truthy_comparison(if_expr, "$CI_COMMIT_TAG"),
1203                Some(true)
1204            ) {
1205                return true;
1206            }
1207        }
1208    }
1209    if let Some(only) = job_map.get("only") {
1210        if only_lists_protected_ref(only) {
1211            return true;
1212        }
1213    }
1214    false
1215}
1216
1217/// Check `only:` for protected/default-branch refs (`main`, `master`, `tags`,
1218/// or a `refs:` list containing those). Conservative — does NOT include
1219/// `merge_requests` (that's the opposite signal).
1220fn only_lists_protected_ref(v: &Value) -> bool {
1221    fn is_protected_ref(s: &str) -> bool {
1222        matches!(s, "main" | "master" | "tags") || s.starts_with("/^release")
1223    }
1224    match v {
1225        Value::String(s) => is_protected_ref(s.as_str()),
1226        Value::Sequence(seq) => seq
1227            .iter()
1228            .any(|item| item.as_str().map(is_protected_ref).unwrap_or(false)),
1229        Value::Mapping(m) => {
1230            if let Some(refs) = m.get("refs").and_then(|r| r.as_sequence()) {
1231                return refs
1232                    .iter()
1233                    .any(|item| item.as_str().map(is_protected_ref).unwrap_or(false));
1234            }
1235            false
1236        }
1237        _ => false,
1238    }
1239}
1240
1241/// Check top-level `workflow:` rules for MR trigger.
1242fn has_mr_trigger_in_workflow(wf: &Value) -> bool {
1243    let rules = match wf
1244        .as_mapping()
1245        .and_then(|m| m.get("rules"))
1246        .and_then(|r| r.as_sequence())
1247    {
1248        Some(r) => r,
1249        None => return false,
1250    };
1251    for rule in rules {
1252        if let Some(if_expr) = rule
1253            .as_mapping()
1254            .and_then(|m| m.get("if"))
1255            .and_then(|v| v.as_str())
1256        {
1257            // F2: see `job_has_mr_trigger` — only the positive equality form
1258            // counts; negations are rejected.
1259            if matches_mr_event(if_expr) {
1260                return true;
1261            }
1262        }
1263    }
1264    false
1265}
1266
1267fn workflow_rules_define_variables(wf: &Value) -> bool {
1268    wf.as_mapping()
1269        .and_then(|m| m.get("rules"))
1270        .is_some_and(|rules| rules_define_variables(Some(rules)))
1271}
1272
1273fn rules_define_variables(rules: Option<&Value>) -> bool {
1274    let Some(rules) = rules.and_then(|v| v.as_sequence()) else {
1275        return false;
1276    };
1277    rules
1278        .iter()
1279        .filter_map(|rule| rule.as_mapping())
1280        .any(|rule| rule.contains_key("variables"))
1281}
1282
1283/// Returns true when `if_expr` positively asserts that the pipeline source IS
1284/// `merge_request_event`. Accepts `$CI_PIPELINE_SOURCE == "merge_request_event"`
1285/// (and quoted/`||`/`&&` variants) at the truthy-comparison level. Rejects the
1286/// `!=` negation form. Falls back to `false` for anything we can't parse — the
1287/// caller always treats that as "no MR trigger detected".
1288fn matches_mr_event(if_expr: &str) -> bool {
1289    // We don't have a `var == "merge_request_event"` pseudo-variable, so we
1290    // synthesise one: split on `||` ourselves and look for any disjunct that
1291    // is exactly `$CI_PIPELINE_SOURCE == "merge_request_event"` (with
1292    // tolerable whitespace and quoting variations).
1293    fn atom_is_mr_event(atom: &str) -> bool {
1294        let s = atom.trim().trim_matches('(').trim_matches(')').trim();
1295        let (lhs, rhs) = match s.split_once("==") {
1296            Some(parts) => parts,
1297            None => return false,
1298        };
1299        let lhs = lhs.trim();
1300        let rhs_norm = rhs.trim().trim_matches('"').trim_matches('\'');
1301        // Either side may carry the variable; the other must equal the literal.
1302        let lhs_unq = lhs.trim_matches('"').trim_matches('\'');
1303        let rhs_raw = rhs.trim().trim_matches('"').trim_matches('\'');
1304        if (lhs_unq == "$CI_PIPELINE_SOURCE" && rhs_norm == "merge_request_event")
1305            || (rhs_raw == "$CI_PIPELINE_SOURCE" && lhs_unq == "merge_request_event")
1306        {
1307            return true;
1308        }
1309        false
1310    }
1311    let trimmed = if_expr.trim();
1312    // Top-level `||` short-circuit: any positive disjunct wins.
1313    if let Some((lhs, rhs)) = split_top_level(trimmed, "||") {
1314        return atom_is_mr_event(&lhs) || matches_mr_event(&rhs);
1315    }
1316    // For `&&`, accept if any conjunct is a positive `merge_request_event`
1317    // comparison. We don't try to detect contradictory conjuncts —
1318    // `merge_request_event` is a string literal, not a boolean, so the
1319    // truthiness short-circuiting in `check_truthy_comparison` doesn't apply.
1320    if let Some((lhs, rhs)) = split_top_level(trimmed, "&&") {
1321        return atom_is_mr_event(&lhs) || matches_mr_event(&rhs);
1322    }
1323    atom_is_mr_event(trimmed)
1324}
1325
/// Structured representation of a single `include:` entry.
///
/// Serialised into `AuthorityGraph::metadata[META_GITLAB_INCLUDES]` so that
/// downstream rules (e.g. `unpinned_include_remote_or_branch_ref`) can analyse
/// remote-URL pins, project refs, and missing `ref:` defaults without re-parsing
/// the YAML.
///
/// All three fields are plain strings; an absent value is represented by the
/// empty string rather than by `Option`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct IncludeEntry {
    /// Include kind: `local`, `remote`, `template`, `project`, `component`, or
    /// `unknown` for shapes we don't recognise.
    ///
    /// NOTE(review): the constructors visible next to this type only ever
    /// produce the first five values — confirm whether `unknown` is still
    /// emitted anywhere.
    pub kind: String,
    /// The path / URL / project string the include points at. For
    /// `component` includes this is the part before the last `@`.
    pub target: String,
    /// The resolved `ref:` value. Empty string when the include omits a `ref:`
    /// (defaults to HEAD on the source repo, which is itself a finding).
    /// For `component` includes this holds the version after the last `@`.
    pub git_ref: String,
}
1343
1344/// Parse the top-level `include:` value into a flat list of `IncludeEntry`s.
1345///
1346/// `include:` accepts five shapes — string, sequence-of-strings, sequence-of-mappings,
1347/// sequence-of-strings-mixed-with-mappings, and a single mapping. Normalise all of
1348/// them into one flat list so the rule layer doesn't have to.
1349pub fn extract_include_entries(v: &Value) -> Vec<IncludeEntry> {
1350    let mut out = Vec::new();
1351    match v {
1352        // `include: 'path/to/local.yml'` — sugar for a local include
1353        Value::String(s) => {
1354            out.push(IncludeEntry {
1355                kind: classify_string_include(s).into(),
1356                target: s.clone(),
1357                git_ref: String::new(),
1358            });
1359        }
1360        Value::Sequence(seq) => {
1361            for item in seq {
1362                match item {
1363                    Value::String(s) => {
1364                        out.push(IncludeEntry {
1365                            kind: classify_string_include(s).into(),
1366                            target: s.clone(),
1367                            git_ref: String::new(),
1368                        });
1369                    }
1370                    Value::Mapping(m) => {
1371                        if let Some(e) = include_entry_from_mapping(m) {
1372                            out.push(e);
1373                        }
1374                    }
1375                    _ => {}
1376                }
1377            }
1378        }
1379        Value::Mapping(m) => {
1380            if let Some(e) = include_entry_from_mapping(m) {
1381                out.push(e);
1382            }
1383        }
1384        _ => {}
1385    }
1386    out
1387}
1388
/// Heuristic for the string shorthand of `include:`: a value that starts with
/// `http://` or `https://` (compared ASCII-case-insensitively) is treated as
/// a `remote` include; everything else is a `local` path.
fn classify_string_include(s: &str) -> &'static str {
    let raw = s.as_bytes();
    let has_scheme = |scheme: &[u8]| {
        raw.get(..scheme.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(scheme))
    };
    if has_scheme(b"http://") || has_scheme(b"https://") {
        "remote"
    } else {
        "local"
    }
}
1399
1400/// Lift one of the four mapping forms (`local:`, `remote:`, `template:`,
1401/// `project:`, `component:`) into an `IncludeEntry`. Returns None when the
1402/// mapping has none of the recognised keys.
1403fn include_entry_from_mapping(m: &serde_yaml::Mapping) -> Option<IncludeEntry> {
1404    let str_at = |key: &str| {
1405        m.get(key)
1406            .and_then(|v| v.as_str())
1407            .map(str::to_string)
1408            .unwrap_or_default()
1409    };
1410    if let Some(s) = m.get("local").and_then(|v| v.as_str()) {
1411        return Some(IncludeEntry {
1412            kind: "local".into(),
1413            target: s.to_string(),
1414            git_ref: String::new(),
1415        });
1416    }
1417    if let Some(s) = m.get("remote").and_then(|v| v.as_str()) {
1418        return Some(IncludeEntry {
1419            kind: "remote".into(),
1420            target: s.to_string(),
1421            git_ref: String::new(),
1422        });
1423    }
1424    if let Some(s) = m.get("template").and_then(|v| v.as_str()) {
1425        return Some(IncludeEntry {
1426            kind: "template".into(),
1427            target: s.to_string(),
1428            git_ref: String::new(),
1429        });
1430    }
1431    if let Some(s) = m.get("component").and_then(|v| v.as_str()) {
1432        // GitLab CI/CD components: source@version → version is the pin
1433        let (target, git_ref) = match s.rsplit_once('@') {
1434            Some((path, ver)) => (path.to_string(), ver.to_string()),
1435            None => (s.to_string(), String::new()),
1436        };
1437        return Some(IncludeEntry {
1438            kind: "component".into(),
1439            target,
1440            git_ref,
1441        });
1442    }
1443    if m.contains_key("project") {
1444        let project = str_at("project");
1445        // ref: may be missing → empty string indicates HEAD/default branch,
1446        // which is itself a supply-chain finding.
1447        let git_ref = str_at("ref");
1448        return Some(IncludeEntry {
1449            kind: "project".into(),
1450            target: project,
1451            git_ref,
1452        });
1453    }
1454    None
1455}
1456
1457/// Extract a flat list of template names from an `extends:` value.
1458/// `extends:` accepts a single string or a sequence of strings.
1459fn extract_extends_list(v: Option<&Value>) -> Vec<String> {
1460    let v = match v {
1461        Some(v) => v,
1462        None => return Vec::new(),
1463    };
1464    match v {
1465        Value::String(s) => vec![s.clone()],
1466        Value::Sequence(seq) => seq
1467            .iter()
1468            .filter_map(|i| i.as_str().map(str::to_string))
1469            .collect(),
1470        _ => Vec::new(),
1471    }
1472}
1473
1474/// Returns true when `default:` carries keys that can change authority
1475/// interpretation for jobs inheriting it.
1476fn default_contains_authority_relevant_keys(m: &serde_yaml::Mapping) -> bool {
1477    [
1478        "image",
1479        "services",
1480        "variables",
1481        "secrets",
1482        "id_tokens",
1483        "before_script",
1484        "after_script",
1485        "cache",
1486        "artifacts",
1487    ]
1488    .iter()
1489    .any(|k| m.contains_key(*k))
1490}
1491
1492/// Returns true when any entry in `services:` has an image name matching
1493/// `docker:*-dind` (or bare `docker:dind`). Recognises both shapes:
1494/// `services: [docker:dind]` and `services: [{name: docker:dind}]`.
1495fn job_services_have_dind(services: Option<&Value>) -> bool {
1496    let list = match services.and_then(|v| v.as_sequence()) {
1497        Some(s) => s,
1498        None => return false,
1499    };
1500    for item in list {
1501        let img = match extract_image_str(item) {
1502            Some(s) => s,
1503            None => continue,
1504        };
1505        if image_is_dind(&img) {
1506            return true;
1507        }
1508    }
1509    false
1510}
1511
/// Recognise Docker-in-Docker images such as `docker:dind`,
/// `docker:24.0-dind` or `docker:24.0.7-dind-rootless`.
///
/// The comparison is ASCII-case-insensitive. Anything from the first `@`
/// onward (a digest pin) is ignored; the remainder must start with `docker:`
/// or `docker/` and contain `dind` somewhere.
fn image_is_dind(image: &str) -> bool {
    let normalized = image.to_ascii_lowercase();
    // Drop a trailing `@sha256:...` pin before inspecting name and tag.
    let reference = normalized.split('@').next().unwrap_or(&normalized);
    let is_docker_image = ["docker:", "docker/"]
        .iter()
        .any(|prefix| reference.starts_with(*prefix));
    is_docker_image && reference.contains("dind")
}
1528
1529/// Classify a `trigger:` block as either `static` (in-tree YAML / fixed
1530/// downstream project) or `dynamic` (include from a previous job's artifact —
1531/// dynamic child pipelines, the code-injection sink). Returns None when no
1532/// `trigger:` block is present.
1533fn classify_trigger(trigger: Option<&Value>) -> Option<&'static str> {
1534    let t = trigger?;
1535    // Shorthand: `trigger: my/downstream/project` → static
1536    if t.is_string() {
1537        return Some("static");
1538    }
1539    let m = t.as_mapping()?;
1540    // Look at every `include:` entry under trigger; if ANY one references an
1541    // `artifact:` field, the child pipeline is dynamic.
1542    if let Some(inc) = m.get("include") {
1543        if include_has_artifact_source(inc) {
1544            return Some("dynamic");
1545        }
1546    }
1547    Some("static")
1548}
1549
1550/// Walk a `trigger.include:` value (string / sequence / mapping) and return
1551/// true when any entry's mapping carries an `artifact:` key.
1552fn include_has_artifact_source(v: &Value) -> bool {
1553    match v {
1554        Value::Mapping(m) => m.contains_key("artifact"),
1555        Value::Sequence(seq) => seq.iter().any(|i| {
1556            i.as_mapping()
1557                .map(|m| m.contains_key("artifact"))
1558                .unwrap_or(false)
1559        }),
1560        _ => false,
1561    }
1562}
1563
1564/// Extract `(cache.key, cache.policy)` from a job's `cache:` value. Returns
1565/// `None` when no cache is declared. `cache:` may be a sequence of mappings
1566/// (multiple caches); we capture the first key/policy pair so the rule layer
1567/// has at least one signal — multi-cache analysis is left to a future
1568/// extension.
1569///
1570/// `cache.key:` may be:
1571/// - a string: `key: vendor`
1572/// - a mapping: `key: { files: [Gemfile.lock] }` → captured as `files:Gemfile.lock,...`
1573/// - a mapping with `prefix:` → captured as `prefix:<value>`
1574fn extract_cache_key_policy(v: Option<&Value>) -> Option<(String, Option<String>)> {
1575    let v = v?;
1576    let m = match v {
1577        Value::Mapping(m) => m,
1578        Value::Sequence(seq) => {
1579            // First cache wins — same heuristic used elsewhere.
1580            return seq
1581                .iter()
1582                .find_map(|i| i.as_mapping().and_then(extract_cache_key_policy_map));
1583        }
1584        _ => return None,
1585    };
1586    extract_cache_key_policy_map(m)
1587}
1588
1589fn extract_cache_key_policy_map(m: &serde_yaml::Mapping) -> Option<(String, Option<String>)> {
1590    let key = match m.get("key") {
1591        Some(Value::String(s)) => s.clone(),
1592        Some(Value::Number(n)) => n.to_string(),
1593        Some(Value::Bool(b)) => b.to_string(),
1594        Some(Value::Mapping(km)) => {
1595            let mut parts = Vec::new();
1596            if let Some(prefix) = km.get("prefix").and_then(|v| v.as_str()) {
1597                parts.push(format!("prefix:{prefix}"));
1598            }
1599            if let Some(files) = km.get("files").and_then(|v| v.as_sequence()) {
1600                let names: Vec<String> = files
1601                    .iter()
1602                    .filter_map(|f| f.as_str().map(str::to_string))
1603                    .collect();
1604                if !names.is_empty() {
1605                    parts.push(format!("files:{}", names.join(",")));
1606                }
1607            }
1608            if parts.is_empty() {
1609                String::new()
1610            } else {
1611                parts.join(";")
1612            }
1613        }
1614        _ => String::new(),
1615    };
1616    let policy = m.get("policy").and_then(|v| v.as_str()).map(str::to_string);
1617    Some((key, policy))
1618}
1619
1620#[cfg(test)]
1621mod tests {
1622    use super::*;
1623
1624    fn parse(yaml: &str) -> AuthorityGraph {
1625        let parser = GitlabParser;
1626        let source = PipelineSource {
1627            file: ".gitlab-ci.yml".into(),
1628            repo: None,
1629            git_ref: None,
1630            commit_sha: None,
1631        };
1632        parser.parse(yaml, &source).unwrap()
1633    }
1634
1635    #[test]
1636    fn ci_job_token_always_present() {
1637        let yaml = r#"
1638stages:
1639  - build
1640
1641build-job:
1642  stage: build
1643  script:
1644    - make build
1645"#;
1646        let graph = parse(yaml);
1647        let identities: Vec<_> = graph.nodes_of_kind(NodeKind::Identity).collect();
1648        assert_eq!(identities.len(), 1);
1649        assert_eq!(identities[0].name, "CI_JOB_TOKEN");
1650        assert_eq!(
1651            identities[0]
1652                .metadata
1653                .get(META_IMPLICIT)
1654                .map(String::as_str),
1655            Some("true")
1656        );
1657        assert_eq!(
1658            identities[0]
1659                .metadata
1660                .get(META_IDENTITY_SCOPE)
1661                .map(String::as_str),
1662            Some("broad")
1663        );
1664    }
1665
1666    #[test]
1667    fn global_credential_variable_emits_secret_node() {
1668        let yaml = r#"
1669variables:
1670  APP_VERSION: "1.0"
1671  DEPLOY_TOKEN: "$CI_DEPLOY_TOKEN"
1672
1673build-job:
1674  script:
1675    - make
1676"#;
1677        let graph = parse(yaml);
1678        let secrets: Vec<_> = graph.nodes_of_kind(NodeKind::Secret).collect();
1679        assert!(
1680            secrets.iter().any(|s| s.name == "DEPLOY_TOKEN"),
1681            "DEPLOY_TOKEN must emit a Secret node, got: {:?}",
1682            secrets.iter().map(|s| &s.name).collect::<Vec<_>>()
1683        );
1684        // Plain config variable must not emit Secret
1685        assert!(
1686            !secrets.iter().any(|s| s.name == "APP_VERSION"),
1687            "APP_VERSION must not emit a Secret node"
1688        );
1689    }
1690
1691    #[test]
1692    fn floating_image_emits_untrusted_image_node() {
1693        let yaml = r#"
1694deploy:
1695  image: alpine:latest
1696  script:
1697    - deploy.sh
1698"#;
1699        let graph = parse(yaml);
1700        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
1701        assert_eq!(images.len(), 1);
1702        assert_eq!(images[0].name, "alpine:latest");
1703        assert_eq!(images[0].trust_zone, TrustZone::Untrusted);
1704    }
1705
1706    #[test]
1707    fn digest_pinned_image_is_third_party() {
1708        let yaml = r#"
1709deploy:
1710  image: "alpine@sha256:a5ac7e51b41094c92402da3b24376905380afc29a5ac7e51b41094c92402da3b"
1711  script:
1712    - deploy.sh
1713"#;
1714        let graph = parse(yaml);
1715        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
1716        assert_eq!(images.len(), 1);
1717        assert_eq!(images[0].trust_zone, TrustZone::ThirdParty);
1718    }
1719
1720    #[test]
1721    fn id_tokens_emit_oidc_identity_nodes() {
1722        let yaml = r#"
1723deploy:
1724  id_tokens:
1725    SIGSTORE_ID_TOKEN:
1726      aud: sigstore
1727    AWS_OIDC_TOKEN:
1728      aud: https://sts.amazonaws.com
1729  script:
1730    - deploy.sh
1731"#;
1732        let graph = parse(yaml);
1733        let oidc: Vec<_> = graph
1734            .nodes_of_kind(NodeKind::Identity)
1735            .filter(|n| n.metadata.get(META_OIDC).map(String::as_str) == Some("true"))
1736            .collect();
1737        assert_eq!(
1738            oidc.len(),
1739            2,
1740            "expected 2 OIDC identity nodes, got: {:?}",
1741            oidc.iter().map(|n| &n.name).collect::<Vec<_>>()
1742        );
1743    }
1744
1745    #[test]
1746    fn explicit_secrets_emit_secret_nodes() {
1747        let yaml = r#"
1748deploy:
1749  secrets:
1750    DATABASE_PASSWORD:
1751      vault: production/db/password@secret
1752    AWS_KEY:
1753      aws_secrets_manager:
1754        name: my-secret
1755  script:
1756    - deploy.sh
1757"#;
1758        let graph = parse(yaml);
1759        let secrets: Vec<_> = graph.nodes_of_kind(NodeKind::Secret).collect();
1760        let names: Vec<_> = secrets.iter().map(|s| s.name.as_str()).collect();
1761        assert!(names.contains(&"DATABASE_PASSWORD"), "got: {names:?}");
1762        assert!(names.contains(&"AWS_KEY"), "got: {names:?}");
1763    }
1764
1765    #[test]
1766    fn rules_mr_trigger_sets_meta_trigger() {
1767        let yaml = r#"
1768test:
1769  rules:
1770    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
1771  script:
1772    - run tests
1773"#;
1774        let graph = parse(yaml);
1775        assert_eq!(
1776            graph.metadata.get(META_TRIGGER).map(String::as_str),
1777            Some("merge_request"),
1778            "META_TRIGGER must be set to merge_request"
1779        );
1780    }
1781
1782    #[test]
1783    fn only_merge_requests_sets_meta_trigger() {
1784        let yaml = r#"
1785test:
1786  only:
1787    - merge_requests
1788  script:
1789    - run tests
1790"#;
1791        let graph = parse(yaml);
1792        assert_eq!(
1793            graph.metadata.get(META_TRIGGER).map(String::as_str),
1794            Some("merge_request")
1795        );
1796    }
1797
1798    #[test]
1799    fn include_marks_graph_partial() {
1800        let yaml = r#"
1801include:
1802  - local: '/templates/.base.yml'
1803
1804build:
1805  script:
1806    - make
1807"#;
1808        let graph = parse(yaml);
1809        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
1810        assert_eq!(graph.completeness_gap_kinds[0], GapKind::Structural);
1811    }
1812
1813    #[test]
1814    fn default_with_authority_relevant_keys_marks_partial() {
1815        let yaml = r#"
1816default:
1817    image: alpine:latest
1818    before_script:
1819        - echo from default
1820
1821build:
1822    script:
1823        - make
1824"#;
1825        let graph = parse(yaml);
1826        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
1827        assert!(
1828            graph
1829                .completeness_gaps
1830                .iter()
1831                .any(|r| r.contains("default:") && r.contains("inherit")),
1832            "expected default-inheritance partial reason, got: {:?}",
1833            graph.completeness_gaps
1834        );
1835    }
1836
1837    #[test]
1838    fn inherit_key_marks_partial() {
1839        let yaml = r#"
1840variables:
1841    DEPLOY_TOKEN: "$CI_DEPLOY_TOKEN"
1842
1843deploy:
1844    inherit:
1845        variables: false
1846    script:
1847        - deploy.sh
1848"#;
1849        let graph = parse(yaml);
1850        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
1851        assert!(
1852            graph
1853                .completeness_gaps
1854                .iter()
1855                .any(|r| r.contains("job 'deploy' uses inherit:")),
1856            "expected inherit partial reason, got: {:?}",
1857            graph.completeness_gaps
1858        );
1859    }
1860
1861    #[test]
1862    fn extends_marks_graph_partial() {
1863        let yaml = r#"
1864.base:
1865  script:
1866    - echo base
1867
1868my-job:
1869  extends: .base
1870  stage: build
1871"#;
1872        let graph = parse(yaml);
1873        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
1874        // Two structural gaps: the hidden `.base` template job and the
1875        // `extends:` inheritance on my-job.
1876        assert!(
1877            graph
1878                .completeness_gap_kinds
1879                .iter()
1880                .all(|k| *k == GapKind::Structural),
1881            "expected all gaps Structural, got: {:?}",
1882            graph.completeness_gap_kinds
1883        );
1884    }
1885
1886    #[test]
1887    fn meta_job_name_set_on_step_nodes() {
1888        let yaml = r#"
1889build:
1890  script:
1891    - make
1892deploy:
1893  script:
1894    - deploy.sh
1895"#;
1896        let graph = parse(yaml);
1897        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
1898        assert_eq!(steps.len(), 2);
1899        for step in &steps {
1900            assert!(
1901                step.metadata.contains_key(META_JOB_NAME),
1902                "Step '{}' missing META_JOB_NAME",
1903                step.name
1904            );
1905        }
1906        // Verify job names are correct
1907        let names: Vec<_> = steps
1908            .iter()
1909            .map(|s| s.metadata.get(META_JOB_NAME).unwrap().as_str())
1910            .collect();
1911        assert!(names.contains(&"build"), "got: {names:?}");
1912        assert!(names.contains(&"deploy"), "got: {names:?}");
1913    }
1914
1915    #[test]
1916    fn reserved_keywords_not_parsed_as_jobs() {
1917        let yaml = r#"
1918stages:
1919  - build
1920  - test
1921
1922variables:
1923  MY_VAR: value
1924
1925image: alpine:latest
1926
1927build:
1928  stage: build
1929  script:
1930    - make
1931"#;
1932        let graph = parse(yaml);
1933        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
1934        assert_eq!(
1935            steps.len(),
1936            1,
1937            "only 'build' should be a Step, got: {:?}",
1938            steps.iter().map(|s| &s.name).collect::<Vec<_>>()
1939        );
1940        assert_eq!(steps[0].name, "build");
1941    }
1942
1943    #[test]
1944    fn services_emit_image_nodes() {
1945        let yaml = r#"
1946test:
1947  services:
1948    - docker:dind
1949    - name: postgres:14
1950  script:
1951    - run_tests
1952"#;
1953        let graph = parse(yaml);
1954        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
1955        assert_eq!(
1956            images.len(),
1957            2,
1958            "expected 2 service Image nodes, got: {:?}",
1959            images.iter().map(|i| &i.name).collect::<Vec<_>>()
1960        );
1961    }
1962
1963    // ── Cross-platform misclassification trap (red-team R2 #5) ─────
1964
1965    #[test]
1966    fn job_carrier_with_unparseable_bodies_marks_partial() {
1967        // Top-level keys that look like job names but whose values are not
1968        // mappings (lists, scalars). GitLab parser would normally produce a
1969        // Step per non-reserved mapping-valued key; here every candidate is
1970        // skipped because the value is not a mapping. Result: 0 step nodes
1971        // despite a non-empty job carrier — must mark Partial.
1972        let yaml = r#"
1973build:
1974  - this is a list, not a mapping
1975test:
1976  - also a list
1977"#;
1978        let graph = parse(yaml);
1979        let step_count = graph
1980            .nodes
1981            .iter()
1982            .filter(|n| n.kind == NodeKind::Step)
1983            .count();
1984        // Note: the "had_job_carrier" heuristic only fires when the value IS
1985        // a mapping, so this case (non-mapping values) does NOT trigger the
1986        // partial — that's intentional. The heuristic targets the trap where
1987        // an attacker uses a *valid mapping shape* the GitLab parser can't
1988        // interpret.
1989        assert_eq!(step_count, 0);
1990        assert_eq!(
1991            graph.completeness,
1992            AuthorityCompleteness::Complete,
1993            "non-mapping values are not job carriers"
1994        );
1995    }
1996
1997    // ── Regression tests for F1-F6 (gitlab-parser deep review) ──────────
1998
1999    /// F1: `$CI_COMMIT_REF_PROTECTED == "true"` stamps protected-only;
2000    /// the negation `== "false"` must NOT stamp.
2001    #[test]
2002    fn protected_ref_only_stamps_meta_when_truly_positive() {
2003        let positive = r#"
2004deploy:
2005  rules:
2006    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
2007  script:
2008    - deploy.sh
2009"#;
2010        let graph = parse(positive);
2011        let step = graph.nodes_of_kind(NodeKind::Step).next().unwrap();
2012        assert_eq!(
2013            step.metadata
2014                .get(META_RULES_PROTECTED_ONLY)
2015                .map(String::as_str),
2016            Some("true"),
2017            "positive == \"true\" comparison must stamp META_RULES_PROTECTED_ONLY"
2018        );
2019
2020        let negation = r#"
2021deploy:
2022  rules:
2023    - if: '$CI_COMMIT_REF_PROTECTED == "false"'
2024  script:
2025    - deploy.sh
2026"#;
2027        let graph = parse(negation);
2028        let step = graph.nodes_of_kind(NodeKind::Step).next().unwrap();
2029        assert!(
2030            !step.metadata.contains_key(META_RULES_PROTECTED_ONLY),
2031            "== \"false\" is the OPPOSITE signal — must NOT stamp META_RULES_PROTECTED_ONLY (got: {:?})",
2032            step.metadata.get(META_RULES_PROTECTED_ONLY)
2033        );
2034
2035        // `!= "true"` is also a negation — must not stamp.
2036        let inequality = r#"
2037deploy:
2038  rules:
2039    - if: '$CI_COMMIT_REF_PROTECTED != "true"'
2040  script:
2041    - deploy.sh
2042"#;
2043        let graph = parse(inequality);
2044        let step = graph.nodes_of_kind(NodeKind::Step).next().unwrap();
2045        assert!(
2046            !step.metadata.contains_key(META_RULES_PROTECTED_ONLY),
2047            "!= \"true\" is a negation — must NOT stamp META_RULES_PROTECTED_ONLY"
2048        );
2049
2050        // `$CI_COMMIT_TAG_MESSAGE` substring trap — used to match because
2051        // `if_expr.contains("$CI_COMMIT_TAG")` was true even though the var
2052        // is a different one.
2053        let tag_message_trap = r#"
2054deploy:
2055  rules:
2056    - if: '$CI_COMMIT_TAG_MESSAGE == "release"'
2057  script:
2058    - deploy.sh
2059"#;
2060        let graph = parse(tag_message_trap);
2061        let step = graph.nodes_of_kind(NodeKind::Step).next().unwrap();
2062        assert!(
2063            !step.metadata.contains_key(META_RULES_PROTECTED_ONLY),
2064            "$CI_COMMIT_TAG_MESSAGE must not match the $CI_COMMIT_TAG predicate"
2065        );
2066    }
2067
2068    /// F2: `$CI_PIPELINE_SOURCE != "merge_request_event"` ("run except on MRs")
2069    /// must NOT stamp `META_TRIGGER=merge_request`. Only the positive form
2070    /// counts.
2071    #[test]
2072    fn mr_trigger_detection_rejects_negation() {
2073        let negation = r#"
2074build:
2075  rules:
2076    - if: '$CI_PIPELINE_SOURCE != "merge_request_event"'
2077  script:
2078    - make build
2079"#;
2080        let graph = parse(negation);
2081        assert!(
2082            graph.metadata.get(META_TRIGGER).map(String::as_str) != Some("merge_request"),
2083            "negation form must not stamp META_TRIGGER=merge_request, got: {:?}",
2084            graph.metadata.get(META_TRIGGER)
2085        );
2086        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
2087        assert_eq!(steps.len(), 1);
2088        assert!(
2089            steps[0].metadata.get(META_TRIGGER).map(String::as_str) != Some("merge_request"),
2090            "negation form must not stamp per-step META_TRIGGER=merge_request"
2091        );
2092
2093        // Positive form still works.
2094        let positive = r#"
2095build:
2096  rules:
2097    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
2098  script:
2099    - make build
2100"#;
2101        let graph = parse(positive);
2102        assert_eq!(
2103            graph.metadata.get(META_TRIGGER).map(String::as_str),
2104            Some("merge_request"),
2105            "positive form must still stamp META_TRIGGER=merge_request"
2106        );
2107    }
2108
2109    /// F3: list-form `aud:` produces `META_OIDC_AUDIENCES` (plural) and a
2110    /// comma-joined `META_OIDC_AUDIENCE` for backward compat. Scalar form
2111    /// stamps only `META_OIDC_AUDIENCE` and leaves the plural marker absent.
2112    #[test]
2113    fn id_tokens_aud_list_form_creates_audiences_metadata() {
2114        let yaml = r#"
2115deploy:
2116  id_tokens:
2117    MULTI_CLOUD_TOKEN:
2118      aud:
2119        - https://aws.amazonaws.com
2120        - https://gcp.googleapis.com
2121  script:
2122    - deploy.sh
2123"#;
2124        let graph = parse(yaml);
2125        let oidc: Vec<_> = graph
2126            .nodes_of_kind(NodeKind::Identity)
2127            .filter(|n| n.metadata.get(META_OIDC).map(String::as_str) == Some("true"))
2128            .collect();
2129        assert_eq!(oidc.len(), 1);
2130        assert_eq!(
2131            oidc[0]
2132                .metadata
2133                .get(META_OIDC_AUDIENCES)
2134                .map(String::as_str),
2135            Some("https://aws.amazonaws.com,https://gcp.googleapis.com"),
2136            "list-form aud must stamp comma-joined META_OIDC_AUDIENCES"
2137        );
2138        // Backward compat: META_OIDC_AUDIENCE holds the same comma-joined value
2139        // (no longer "unknown" as it was before the fix).
2140        assert_eq!(
2141            oidc[0].metadata.get(META_OIDC_AUDIENCE).map(String::as_str),
2142            Some("https://aws.amazonaws.com,https://gcp.googleapis.com"),
2143        );
2144        assert!(oidc[0].name.contains("aud=https://aws"));
2145
2146        // Scalar form: META_OIDC_AUDIENCE is the bare string, plural marker absent.
2147        let scalar = r#"
2148deploy:
2149  id_tokens:
2150    AWS_TOKEN:
2151      aud: https://sts.amazonaws.com
2152  script:
2153    - deploy.sh
2154"#;
2155        let graph = parse(scalar);
2156        let oidc: Vec<_> = graph
2157            .nodes_of_kind(NodeKind::Identity)
2158            .filter(|n| n.metadata.get(META_OIDC).map(String::as_str) == Some("true"))
2159            .collect();
2160        assert_eq!(
2161            oidc[0].metadata.get(META_OIDC_AUDIENCE).map(String::as_str),
2162            Some("https://sts.amazonaws.com")
2163        );
2164        assert!(
2165            !oidc[0].metadata.contains_key(META_OIDC_AUDIENCES),
2166            "scalar form must NOT set the plural META_OIDC_AUDIENCES marker"
2167        );
2168    }
2169
2170    /// F4: `is_credential_name` must boundary-check; substring matches like
2171    /// `CERTAIN_FLAG` (contains `CERT`), `TOKENIZER_VERSION` (contains `TOKEN`),
2172    /// `UNCERTAIN`, and `CERTIFICATE_PATH` (path config, not a credential)
2173    /// must all return false. Real credentials still match.
2174    #[test]
2175    fn is_credential_name_boundary_checks() {
2176        // False positives that the substring matcher used to flag.
2177        assert!(!is_credential_name("CERTAIN_FLAG"));
2178        assert!(!is_credential_name("TOKENIZER_VERSION"));
2179        assert!(!is_credential_name("UNCERTAIN"));
2180        assert!(!is_credential_name("CERTIFICATE_PATH"));
2181        assert!(!is_credential_name("TOKEN1"));
2182        assert!(!is_credential_name("CERTIFICATE"));
2183
2184        // True positives — must still match.
2185        assert!(is_credential_name("API_TOKEN"));
2186        assert!(is_credential_name("MY_CERT"));
2187        assert!(is_credential_name("DB_PASSWORD"));
2188        assert!(is_credential_name("DEPLOY_TOKEN"));
2189        assert!(is_credential_name("SIGNING_KEY"));
2190        assert!(is_credential_name("AWS_SECRET_ACCESS_KEY"));
2191        assert!(is_credential_name("TOKEN"));
2192        assert!(is_credential_name("CERT"));
2193        assert!(is_credential_name("PRIVATE_KEY"));
2194        assert!(is_credential_name("CREDENTIAL"));
2195    }
2196
2197    /// F5: a `needs:` entry with `artifacts: false` does NOT promote the
2198    /// upstream's dotenv into this job, so it must be excluded from
2199    /// `META_NEEDS` (the dotenv-flow rule reads that CSV verbatim).
2200    #[test]
2201    fn needs_artifacts_false_excludes_dotenv_flow() {
2202        let yaml = r#"
2203build:
2204  artifacts:
2205    reports:
2206      dotenv: build.env
2207  script:
2208    - make build
2209deploy:
2210  needs:
2211    - job: build
2212      artifacts: false
2213  script:
2214    - kubectl apply
2215"#;
2216        let graph = parse(yaml);
2217        let deploy_step = graph
2218            .nodes_of_kind(NodeKind::Step)
2219            .find(|n| n.metadata.get(META_JOB_NAME).map(String::as_str) == Some("deploy"))
2220            .expect("deploy step present");
2221        let needs_csv = deploy_step
2222            .metadata
2223            .get(META_NEEDS)
2224            .map(String::as_str)
2225            .unwrap_or("");
2226        assert!(
2227            !needs_csv.split(',').any(|s| s == "build"),
2228            "build must be excluded from META_NEEDS when artifacts: false (got: {needs_csv:?})"
2229        );
2230
2231        // Sanity check: same YAML with `artifacts: true` (or missing) still
2232        // includes the upstream so dotenv-flow rules can fire.
2233        let yaml_default = r#"
2234build:
2235  artifacts:
2236    reports:
2237      dotenv: build.env
2238  script:
2239    - make build
2240deploy:
2241  needs:
2242    - job: build
2243  script:
2244    - kubectl apply
2245"#;
2246        let graph = parse(yaml_default);
2247        let deploy_step = graph
2248            .nodes_of_kind(NodeKind::Step)
2249            .find(|n| n.metadata.get(META_JOB_NAME).map(String::as_str) == Some("deploy"))
2250            .expect("deploy step present");
2251        let needs_csv = deploy_step
2252            .metadata
2253            .get(META_NEEDS)
2254            .map(String::as_str)
2255            .unwrap_or("");
2256        assert!(
2257            needs_csv.split(',').any(|s| s == "build"),
2258            "default (artifacts implicitly true) must keep build in META_NEEDS (got: {needs_csv:?})"
2259        );
2260    }
2261
2262    /// Roadmap R3 / Phase 1B: conditional `rules:variables` changes variable
2263    /// scope based on an expression. Until expressions are evaluated, the graph
2264    /// must be Partial rather than silently claiming static completeness.
2265    #[test]
2266    fn rules_variables_mark_typed_expression_gap() {
2267        let yaml = r#"
2268workflow:
2269  rules:
2270    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
2271      variables:
2272        DEPLOY_TOKEN: "$PROD_DEPLOY_TOKEN"
2273
2274deploy:
2275  rules:
2276    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
2277      variables:
2278        CLOUD_PASSWORD: "$PROD_CLOUD_PASSWORD"
2279  script:
2280    - deploy.sh
2281"#;
2282        let graph = parse(yaml);
2283        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
2284        assert_eq!(
2285            graph.completeness_gap_kinds,
2286            vec![GapKind::Expression, GapKind::Expression],
2287            "workflow and job rules:variables should each produce an expression gap"
2288        );
2289        assert!(
2290            graph
2291                .completeness_gaps
2292                .iter()
2293                .any(|gap| gap.contains("workflow:rules:variables define conditional variables")),
2294            "workflow rules:variables gap missing: {:?}",
2295            graph.completeness_gaps
2296        );
2297        assert!(
2298            graph
2299                .completeness_gaps
2300                .iter()
2301                .any(|gap| gap.contains("job 'deploy' uses rules:variables")),
2302            "job rules:variables gap missing: {:?}",
2303            graph.completeness_gaps
2304        );
2305    }
2306
2307    /// F6: 9× parse with bucket-defeating key names — even if a future
2308    /// refactor swapped the indexmap-backed mapping for a HashMap-backed
2309    /// one, the explicit sort would keep NodeId order byte-identical.
2310    #[test]
2311    fn gitlab_mapping_iteration_is_deterministic_across_runs() {
2312        // Names chosen to spread across hash buckets.
2313        let yaml = r#"
2314zeta-job:
2315  variables:
2316    ZZ_TOKEN: "$CI_TOKEN"
2317    AA_PASSWORD: "x"
2318    MM_SECRET: "y"
2319  script:
2320    - echo zeta
2321alpha-job:
2322  variables:
2323    QQ_TOKEN: "$CI_TOKEN"
2324    BB_API_KEY: "z"
2325  script:
2326    - echo alpha
2327mid-job:
2328  variables:
2329    NN_PRIVATE_KEY: "k"
2330    GG_SIGNING_KEY: "j"
2331  script:
2332    - echo mid
2333"#;
2334        let canonical: Vec<(NodeKind, String)> = parse(yaml)
2335            .nodes
2336            .iter()
2337            .map(|n| (n.kind, n.name.clone()))
2338            .collect();
2339        for run in 0..9 {
2340            let again: Vec<(NodeKind, String)> = parse(yaml)
2341                .nodes
2342                .iter()
2343                .map(|n| (n.kind, n.name.clone()))
2344                .collect();
2345            assert_eq!(
2346                again, canonical,
2347                "run {run}: NodeId order must be byte-identical across runs"
2348            );
2349        }
2350    }
2351
2352    #[test]
2353    fn mapping_jobs_without_recognisable_step_content_marks_partial() {
2354        // A non-reserved top-level key whose value is a mapping but contains
2355        // only ADO-style fields (`task:`, `azureSubscription`) — and `extends`
2356        // marks the job as partial without creating a Step. Wait: the GitLab
2357        // parser actually still adds a Step node for any mapping-valued
2358        // non-reserved key. So to get the 0-step + had_carrier shape, we
2359        // need a hidden/template job (starts with '.') as the only candidate.
2360        let yaml = r#"
2361.template-only:
2362  script:
2363    - echo "this is a template-only file"
2364"#;
2365        let graph = parse(yaml);
2366        let step_count = graph
2367            .nodes
2368            .iter()
2369            .filter(|n| n.kind == NodeKind::Step)
2370            .count();
2371        assert_eq!(step_count, 0);
2372        // Hidden jobs already mark partial with their own reason.
2373        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
2374        // The hidden `.template-only` job is a Structural gap. The zero-steps
2375        // fall-through does NOT fire here because `had_job_carrier` only
2376        // counts non-dot-prefixed mapping-valued top-level keys.
2377        assert_eq!(graph.completeness_gap_kinds[0], GapKind::Structural);
2378    }
2379}