Skip to main content

taudit_parse_gitlab/
lib.rs

1use std::collections::HashMap;
2
3use serde::Deserialize;
4use serde_yaml::Value;
5use taudit_core::error::TauditError;
6use taudit_core::graph::*;
7use taudit_core::ports::PipelineParser;
8
/// GitLab CI YAML parser.
///
/// Parses `.gitlab-ci.yml` files into an `AuthorityGraph`. The authority model:
/// - Each job is a `Step` node.
/// - `CI_JOB_TOKEN` is a global implicit `Identity` (always present, scope=broad).
/// - `secrets:` entries emit `Secret` nodes with `HasAccessTo` edges.
/// - `id_tokens:` entries emit OIDC `Identity` nodes.
/// - `variables:` entries with credential-pattern names emit `Secret` nodes.
/// - `image:` and `services:` emit `Image` nodes with `UsesImage` edges.
/// - `include:` and `extends:` mark the graph `Partial`.
/// - Hidden jobs (names starting with `.`) are skipped and also mark the graph `Partial`.
/// - `rules: if: merge_request_event` and `only: merge_requests` set `META_TRIGGER`;
///   the same `if:` check is applied to top-level `workflow: rules:`.
pub struct GitlabParser;
21
/// Reserved top-level keys that are not job definitions.
///
/// The job loop in `GitlabParser::parse` skips every top-level key listed here;
/// any other top-level key is treated as a job name.
const RESERVED: &[&str] = &[
    "stages",
    "workflow",
    "include",
    "variables",
    "image",
    "services",
    "default",
    "cache",
    "before_script",
    "after_script",
    "types",
];
36
/// Variable name fragments that indicate a credential rather than plain config.
///
/// `is_credential_name` upper-cases the variable name and tests whether it
/// contains any of these fragments as a substring, so both `DEPLOY_TOKEN` and
/// `deploy_token` match `TOKEN`.
const CRED_FRAGMENTS: &[&str] = &[
    "TOKEN",
    "SECRET",
    "PASSWORD",
    "PASSWD",
    "PRIVATE_KEY",
    "API_KEY",
    "APIKEY",
    "SIGNING_KEY",
    "ACCESS_KEY",
    "SERVICE_ACCOUNT",
    "CERT",
    "CREDENTIAL",
];
52
53impl PipelineParser for GitlabParser {
54    fn platform(&self) -> &str {
55        "gitlab-ci"
56    }
57
58    fn parse(&self, content: &str, source: &PipelineSource) -> Result<AuthorityGraph, TauditError> {
59        let mut de = serde_yaml::Deserializer::from_str(content);
60        let doc = de
61            .next()
62            .ok_or_else(|| TauditError::Parse("empty YAML document".into()))?;
63        let root: Value = Value::deserialize(doc)
64            .map_err(|e| TauditError::Parse(format!("YAML parse error: {e}")))?;
65
66        let mapping = root
67            .as_mapping()
68            .ok_or_else(|| TauditError::Parse("GitLab CI root must be a mapping".into()))?;
69
70        let mut graph = AuthorityGraph::new(source.clone());
71
72        // CI_JOB_TOKEN is always present in every GitLab CI job — it's the built-in
73        // platform token, equivalent to ADO's System.AccessToken or GHA's GITHUB_TOKEN.
74        let mut meta = HashMap::new();
75        meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
76        meta.insert(META_IMPLICIT.into(), "true".into());
77        let token_id = graph.add_node_with_metadata(
78            NodeKind::Identity,
79            "CI_JOB_TOKEN",
80            TrustZone::FirstParty,
81            meta,
82        );
83
84        // Top-level include: → mark Partial immediately
85        if mapping.contains_key("include") {
86            graph.mark_partial(
87                "include: directive present — included templates not resolved".to_string(),
88            );
89        }
90
91        // Global variables
92        let global_secrets = process_variables(mapping.get("variables"), &mut graph, "pipeline");
93
94        // Global image
95        let global_image = mapping.get("image").and_then(extract_image_str);
96
97        // Top-level merge_request trigger detection from `workflow:` rules
98        if let Some(wf) = mapping.get("workflow") {
99            if has_mr_trigger_in_workflow(wf) {
100                graph
101                    .metadata
102                    .insert(META_TRIGGER.into(), "merge_request".into());
103            }
104        }
105
106        // Process each job (any top-level key not in RESERVED)
107        for (key, value) in mapping {
108            let job_name = match key.as_str() {
109                Some(k) => k,
110                None => continue,
111            };
112            if RESERVED.contains(&job_name) {
113                continue;
114            }
115
116            // Hidden jobs (starting with a dot) are templates — mark Partial, skip
117            if job_name.starts_with('.') {
118                graph.mark_partial(format!(
119                    "job '{job_name}' is a hidden/template job — not resolved"
120                ));
121                continue;
122            }
123
124            let job_map = match value.as_mapping() {
125                Some(m) => m,
126                None => continue,
127            };
128
129            // extends: — job template inheritance, can't resolve statically
130            if job_map.contains_key("extends") {
131                graph.mark_partial(format!(
132                    "job '{job_name}' uses extends: — inherited configuration not resolved"
133                ));
134            }
135
136            // Detect PR/MR trigger in this job's rules: or only:
137            let job_triggers_mr = job_has_mr_trigger(job_map);
138
139            // Propagate job MR trigger to graph level
140            if job_triggers_mr && !graph.metadata.contains_key(META_TRIGGER) {
141                graph
142                    .metadata
143                    .insert(META_TRIGGER.into(), "merge_request".into());
144            }
145
146            // Job-level variables
147            let job_secrets = process_variables(job_map.get("variables"), &mut graph, job_name);
148
149            // Job-level explicit secrets: (Vault, AWS Secrets Manager, GCP, Azure)
150            let explicit_secrets =
151                process_explicit_secrets(job_map.get("secrets"), job_name, &mut graph);
152
153            // Job-level OIDC tokens (id_tokens:)
154            let oidc_identities = process_id_tokens(job_map.get("id_tokens"), job_name, &mut graph);
155
156            // Job image (falls back to global)
157            let job_image_str = job_map
158                .get("image")
159                .and_then(extract_image_str)
160                .or(global_image.as_deref().map(String::from));
161
162            let image_id = job_image_str.as_deref().map(|img| {
163                let pinned = is_docker_digest_pinned(img);
164                let trust_zone = if pinned {
165                    TrustZone::ThirdParty
166                } else {
167                    TrustZone::Untrusted
168                };
169                let mut imeta = HashMap::new();
170                if let Some(digest) = img.split("@sha256:").nth(1) {
171                    imeta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
172                }
173                graph.add_node_with_metadata(NodeKind::Image, img, trust_zone, imeta)
174            });
175
176            // Services (each is an Image node)
177            let service_ids = process_services(job_map.get("services"), &mut graph);
178
179            // Environment — record name as metadata, sets trust boundary marker
180            let env_name = job_map
181                .get("environment")
182                .and_then(extract_environment_name);
183
184            // Create the Step node for this job
185            let mut step_meta = HashMap::new();
186            step_meta.insert(META_JOB_NAME.into(), job_name.to_string());
187            if let Some(ref env) = env_name {
188                step_meta.insert("environment_name".into(), env.clone());
189            }
190            let step_id = graph.add_node_with_metadata(
191                NodeKind::Step,
192                job_name,
193                TrustZone::FirstParty,
194                step_meta,
195            );
196
197            // CI_JOB_TOKEN always available to every step
198            graph.add_edge(step_id, token_id, EdgeKind::HasAccessTo);
199
200            // Link all secrets
201            for &sid in global_secrets
202                .iter()
203                .chain(&job_secrets)
204                .chain(&explicit_secrets)
205            {
206                graph.add_edge(step_id, sid, EdgeKind::HasAccessTo);
207            }
208
209            // Link OIDC identities
210            for &iid in &oidc_identities {
211                graph.add_edge(step_id, iid, EdgeKind::HasAccessTo);
212            }
213
214            // UsesImage edges
215            if let Some(img_id) = image_id {
216                graph.add_edge(step_id, img_id, EdgeKind::UsesImage);
217            }
218            for &svc_id in &service_ids {
219                graph.add_edge(step_id, svc_id, EdgeKind::UsesImage);
220            }
221        }
222
223        Ok(graph)
224    }
225}
226
227/// Detect `image:` string from a YAML value — can be a bare string or a mapping with `name:`.
228fn extract_image_str(v: &Value) -> Option<String> {
229    match v {
230        Value::String(s) => Some(s.clone()),
231        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
232        _ => None,
233    }
234}
235
236/// Extract environment name from `environment:` value (string or mapping).
237fn extract_environment_name(v: &Value) -> Option<String> {
238    match v {
239        Value::String(s) => Some(s.clone()),
240        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
241        _ => None,
242    }
243}
244
245/// Classify a variable name as a credential by checking for common fragments.
246fn is_credential_name(name: &str) -> bool {
247    let upper = name.to_uppercase();
248    CRED_FRAGMENTS.iter().any(|frag| upper.contains(frag))
249}
250
251/// Parse `variables:` mapping and emit `Secret` nodes for credential-pattern names.
252/// Returns the list of created node IDs.
253fn process_variables(vars: Option<&Value>, graph: &mut AuthorityGraph, scope: &str) -> Vec<NodeId> {
254    let mut ids = Vec::new();
255    let map = match vars.and_then(|v| v.as_mapping()) {
256        Some(m) => m,
257        None => return ids,
258    };
259    for (k, _v) in map {
260        let name = match k.as_str() {
261            Some(s) => s,
262            None => continue,
263        };
264        if is_credential_name(name) {
265            let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
266            ids.push(id);
267            let _ = scope; // used for future scoped error messages
268        }
269    }
270    ids
271}
272
273/// Parse `secrets:` block and emit one `Secret` node per named secret.
274///
275/// GitLab CI `secrets:` format:
276/// ```yaml
277/// secrets:
278///   DATABASE_PASSWORD:
279///     vault: production/db/password@secret
280///   AWS_KEY:
281///     aws_secrets_manager:
282///       name: my-secret
283/// ```
284fn process_explicit_secrets(
285    secrets: Option<&Value>,
286    _scope: &str,
287    graph: &mut AuthorityGraph,
288) -> Vec<NodeId> {
289    let mut ids = Vec::new();
290    let map = match secrets.and_then(|v| v.as_mapping()) {
291        Some(m) => m,
292        None => return ids,
293    };
294    for (k, _v) in map {
295        let name = match k.as_str() {
296            Some(s) => s,
297            None => continue,
298        };
299        let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
300        ids.push(id);
301    }
302    ids
303}
304
305/// Parse `id_tokens:` block and emit one OIDC `Identity` node per token.
306///
307/// GitLab CI `id_tokens:` format:
308/// ```yaml
309/// id_tokens:
310///   SIGSTORE_ID_TOKEN:
311///     aud: sigstore
312///   AWS_OIDC_TOKEN:
313///     aud: https://sts.amazonaws.com
314/// ```
315fn process_id_tokens(
316    id_tokens: Option<&Value>,
317    _scope: &str,
318    graph: &mut AuthorityGraph,
319) -> Vec<NodeId> {
320    let mut ids = Vec::new();
321    let map = match id_tokens.and_then(|v| v.as_mapping()) {
322        Some(m) => m,
323        None => return ids,
324    };
325    for (k, v) in map {
326        let token_name = match k.as_str() {
327            Some(s) => s,
328            None => continue,
329        };
330        // Extract audience for labelling
331        let aud = v
332            .as_mapping()
333            .and_then(|m| m.get("aud"))
334            .and_then(|a| a.as_str())
335            .unwrap_or("unknown");
336        let label = format!("{token_name} (aud={aud})");
337        let mut meta = HashMap::new();
338        meta.insert(META_OIDC.into(), "true".into());
339        meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
340        let id =
341            graph.add_node_with_metadata(NodeKind::Identity, label, TrustZone::FirstParty, meta);
342        ids.push(id);
343    }
344    ids
345}
346
347/// Parse `services:` block and emit `Image` nodes.
348fn process_services(services: Option<&Value>, graph: &mut AuthorityGraph) -> Vec<NodeId> {
349    let mut ids = Vec::new();
350    let list = match services.and_then(|v| v.as_sequence()) {
351        Some(s) => s,
352        None => return ids,
353    };
354    for item in list {
355        let img_str = match extract_image_str(item) {
356            Some(s) => s,
357            None => continue,
358        };
359        let pinned = is_docker_digest_pinned(&img_str);
360        let trust_zone = if pinned {
361            TrustZone::ThirdParty
362        } else {
363            TrustZone::Untrusted
364        };
365        let mut meta = HashMap::new();
366        if let Some(digest) = img_str.split("@sha256:").nth(1) {
367            meta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
368        }
369        let id = graph.add_node_with_metadata(NodeKind::Image, &img_str, trust_zone, meta);
370        ids.push(id);
371    }
372    ids
373}
374
375/// Check whether a job's `rules:` or `only:` indicates it runs on merge requests.
376fn job_has_mr_trigger(job_map: &serde_yaml::Mapping) -> bool {
377    // rules: [{if: '$CI_PIPELINE_SOURCE == "merge_request_event"'}]
378    if let Some(rules) = job_map.get("rules").and_then(|v| v.as_sequence()) {
379        for rule in rules {
380            if let Some(if_expr) = rule
381                .as_mapping()
382                .and_then(|m| m.get("if"))
383                .and_then(|v| v.as_str())
384            {
385                if if_expr.contains("merge_request_event") {
386                    return true;
387                }
388            }
389        }
390    }
391    // only: [merge_requests] or only: {refs: [merge_requests]}
392    if let Some(only) = job_map.get("only") {
393        if only_has_merge_requests(only) {
394            return true;
395        }
396    }
397    false
398}
399
400/// Check `only:` value (sequence or mapping) for `merge_requests` entry.
401fn only_has_merge_requests(v: &Value) -> bool {
402    match v {
403        Value::Sequence(seq) => seq
404            .iter()
405            .any(|item| item.as_str() == Some("merge_requests")),
406        Value::Mapping(m) => {
407            if let Some(refs) = m.get("refs").and_then(|r| r.as_sequence()) {
408                return refs
409                    .iter()
410                    .any(|item| item.as_str() == Some("merge_requests"));
411            }
412            false
413        }
414        _ => false,
415    }
416}
417
418/// Check top-level `workflow:` rules for MR trigger.
419fn has_mr_trigger_in_workflow(wf: &Value) -> bool {
420    let rules = match wf
421        .as_mapping()
422        .and_then(|m| m.get("rules"))
423        .and_then(|r| r.as_sequence())
424    {
425        Some(r) => r,
426        None => return false,
427    };
428    for rule in rules {
429        if let Some(if_expr) = rule
430            .as_mapping()
431            .and_then(|m| m.get("if"))
432            .and_then(|v| v.as_str())
433        {
434            if if_expr.contains("merge_request_event") {
435                return true;
436            }
437        }
438    }
439    false
440}
441
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `GitlabParser` over `yaml` with a fixed `.gitlab-ci.yml` source and
    /// panic if parsing fails.
    fn parse_yaml(yaml: &str) -> AuthorityGraph {
        let source = PipelineSource {
            file: ".gitlab-ci.yml".into(),
            repo: None,
            git_ref: None,
            commit_sha: None,
        };
        GitlabParser.parse(yaml, &source).unwrap()
    }

    /// A pipeline with no explicit identities still carries the implicit job token.
    #[test]
    fn ci_job_token_always_present() {
        let yaml = r#"
stages:
  - build

build-job:
  stage: build
  script:
    - make build
"#;
        let graph = parse_yaml(yaml);
        let identities: Vec<_> = graph.nodes_of_kind(NodeKind::Identity).collect();
        assert_eq!(identities.len(), 1);

        let token = &identities[0];
        assert_eq!(token.name, "CI_JOB_TOKEN");

        let implicit = token.metadata.get(META_IMPLICIT).map(String::as_str);
        assert_eq!(implicit, Some("true"));

        let scope = token.metadata.get(META_IDENTITY_SCOPE).map(String::as_str);
        assert_eq!(scope, Some("broad"));
    }

    /// Credential-looking global variables become secrets; plain config does not.
    #[test]
    fn global_credential_variable_emits_secret_node() {
        let yaml = r#"
variables:
  APP_VERSION: "1.0"
  DEPLOY_TOKEN: "$CI_DEPLOY_TOKEN"

build-job:
  script:
    - make
"#;
        let graph = parse_yaml(yaml);
        let secret_names: Vec<&str> = graph
            .nodes_of_kind(NodeKind::Secret)
            .map(|s| s.name.as_str())
            .collect();
        assert!(
            secret_names.contains(&"DEPLOY_TOKEN"),
            "DEPLOY_TOKEN must emit a Secret node, got: {:?}",
            secret_names
        );
        // Plain config variable must not emit Secret
        assert!(
            !secret_names.contains(&"APP_VERSION"),
            "APP_VERSION must not emit a Secret node"
        );
    }

    /// A tag-based image reference is classified as untrusted.
    #[test]
    fn floating_image_emits_untrusted_image_node() {
        let yaml = r#"
deploy:
  image: alpine:latest
  script:
    - deploy.sh
"#;
        let graph = parse_yaml(yaml);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(images.len(), 1);

        let image = &images[0];
        assert_eq!(image.name, "alpine:latest");
        assert_eq!(image.trust_zone, TrustZone::Untrusted);
    }

    /// A digest-pinned image reference is classified as third-party.
    #[test]
    fn digest_pinned_image_is_third_party() {
        let yaml = r#"
deploy:
  image: "alpine@sha256:a5ac7e51b41094c92402da3b24376905380afc29a5ac7e51b41094c92402da3b"
  script:
    - deploy.sh
"#;
        let graph = parse_yaml(yaml);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].trust_zone, TrustZone::ThirdParty);
    }

    /// Every `id_tokens:` entry yields an identity flagged as OIDC.
    #[test]
    fn id_tokens_emit_oidc_identity_nodes() {
        let yaml = r#"
deploy:
  id_tokens:
    SIGSTORE_ID_TOKEN:
      aud: sigstore
    AWS_OIDC_TOKEN:
      aud: https://sts.amazonaws.com
  script:
    - deploy.sh
"#;
        let graph = parse_yaml(yaml);
        let oidc_names: Vec<_> = graph
            .nodes_of_kind(NodeKind::Identity)
            .filter(|n| n.metadata.get(META_OIDC).map(String::as_str) == Some("true"))
            .map(|n| &n.name)
            .collect();
        assert_eq!(
            oidc_names.len(),
            2,
            "expected 2 OIDC identity nodes, got: {:?}",
            oidc_names
        );
    }

    /// Every named entry under `secrets:` yields a secret node.
    #[test]
    fn explicit_secrets_emit_secret_nodes() {
        let yaml = r#"
deploy:
  secrets:
    DATABASE_PASSWORD:
      vault: production/db/password@secret
    AWS_KEY:
      aws_secrets_manager:
        name: my-secret
  script:
    - deploy.sh
"#;
        let graph = parse_yaml(yaml);
        let names: Vec<_> = graph
            .nodes_of_kind(NodeKind::Secret)
            .map(|s| s.name.as_str())
            .collect();
        for expected in ["DATABASE_PASSWORD", "AWS_KEY"] {
            assert!(names.contains(&expected), "got: {names:?}");
        }
    }

    /// A `rules: if:` mentioning `merge_request_event` records the MR trigger.
    #[test]
    fn rules_mr_trigger_sets_meta_trigger() {
        let yaml = r#"
test:
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
  script:
    - run tests
"#;
        let graph = parse_yaml(yaml);
        let trigger = graph.metadata.get(META_TRIGGER).map(String::as_str);
        assert_eq!(
            trigger,
            Some("merge_request"),
            "META_TRIGGER must be set to merge_request"
        );
    }

    /// `only: [merge_requests]` records the MR trigger.
    #[test]
    fn only_merge_requests_sets_meta_trigger() {
        let yaml = r#"
test:
  only:
    - merge_requests
  script:
    - run tests
"#;
        let graph = parse_yaml(yaml);
        let trigger = graph.metadata.get(META_TRIGGER).map(String::as_str);
        assert_eq!(trigger, Some("merge_request"));
    }

    /// A top-level `include:` makes the graph partial.
    #[test]
    fn include_marks_graph_partial() {
        let yaml = r#"
include:
  - local: '/templates/.base.yml'

build:
  script:
    - make
"#;
        let graph = parse_yaml(yaml);
        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
    }

    /// A job using `extends:` (and the hidden template it names) makes the graph partial.
    #[test]
    fn extends_marks_graph_partial() {
        let yaml = r#"
.base:
  script:
    - echo base

my-job:
  extends: .base
  stage: build
"#;
        let graph = parse_yaml(yaml);
        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
    }

    /// Each step node carries its job name under `META_JOB_NAME`.
    #[test]
    fn meta_job_name_set_on_step_nodes() {
        let yaml = r#"
build:
  script:
    - make
deploy:
  script:
    - deploy.sh
"#;
        let graph = parse_yaml(yaml);
        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
        assert_eq!(steps.len(), 2);

        let mut names = Vec::new();
        for step in &steps {
            assert!(
                step.metadata.contains_key(META_JOB_NAME),
                "Step '{}' missing META_JOB_NAME",
                step.name
            );
            names.push(step.metadata.get(META_JOB_NAME).unwrap().as_str());
        }
        // Verify job names are correct
        assert!(names.contains(&"build"), "got: {names:?}");
        assert!(names.contains(&"deploy"), "got: {names:?}");
    }

    /// Reserved top-level keywords never become step nodes.
    #[test]
    fn reserved_keywords_not_parsed_as_jobs() {
        let yaml = r#"
stages:
  - build
  - test

variables:
  MY_VAR: value

image: alpine:latest

build:
  stage: build
  script:
    - make
"#;
        let graph = parse_yaml(yaml);
        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
        assert_eq!(
            steps.len(),
            1,
            "only 'build' should be a Step, got: {:?}",
            steps.iter().map(|s| &s.name).collect::<Vec<_>>()
        );
        assert_eq!(steps[0].name, "build");
    }

    /// Both string and `name:` mapping service entries yield image nodes.
    #[test]
    fn services_emit_image_nodes() {
        let yaml = r#"
test:
  services:
    - docker:dind
    - name: postgres:14
  script:
    - run_tests
"#;
        let graph = parse_yaml(yaml);
        let image_names: Vec<_> = graph
            .nodes_of_kind(NodeKind::Image)
            .map(|i| &i.name)
            .collect();
        assert_eq!(
            image_names.len(),
            2,
            "expected 2 service Image nodes, got: {:?}",
            image_names
        );
    }
}