Skip to main content

taudit_parse_gitlab/
lib.rs

1use std::collections::HashMap;
2
3use serde::Deserialize;
4use serde_yaml::Value;
5use taudit_core::error::TauditError;
6use taudit_core::graph::*;
7use taudit_core::ports::PipelineParser;
8
/// GitLab CI YAML parser.
///
/// Turns the text of a `.gitlab-ci.yml` file into an `AuthorityGraph`. The
/// authority model it applies:
/// - Each job becomes a `Step` node.
/// - `CI_JOB_TOKEN` is one implicit `Identity` node (scope = broad) that every
///   job is linked to.
/// - `secrets:` entries emit `Secret` nodes with `HasAccessTo` edges.
/// - `id_tokens:` entries emit OIDC `Identity` nodes.
/// - `variables:` entries whose names look like credentials emit `Secret` nodes.
/// - `image:` and `services:` emit `Image` nodes with `UsesImage` edges.
/// - `include:`, `extends:` and hidden (`.`-prefixed) jobs mark the graph
///   `Partial`, because their content is not resolved.
/// - `rules: if: merge_request_event` and `only: merge_requests` set `META_TRIGGER`.
///
/// The parser holds no state, so it is cheap to copy and can be created with
/// `GitlabParser` or `GitlabParser::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct GitlabParser;
21
/// Top-level keys that configure the pipeline as a whole. A top-level key
/// listed here is never treated as a job definition.
const RESERVED: &[&str] = &[
    // Pipeline structure and composition.
    "include",
    "stages",
    "types",
    "workflow",
    // Settings shared across jobs.
    "default",
    "variables",
    "image",
    "services",
    "cache",
    "before_script",
    "after_script",
];
36
/// Upper-case substrings that mark a variable name as a credential rather
/// than plain configuration.
const CRED_FRAGMENTS: &[&str] = &[
    // Passwords and generic secrets.
    "PASSWORD",
    "PASSWD",
    "SECRET",
    "CREDENTIAL",
    // Tokens and service identities.
    "TOKEN",
    "SERVICE_ACCOUNT",
    // Keys and certificates.
    "API_KEY",
    "APIKEY",
    "ACCESS_KEY",
    "PRIVATE_KEY",
    "SIGNING_KEY",
    "CERT",
];
52
53impl PipelineParser for GitlabParser {
54    fn platform(&self) -> &str {
55        "gitlab-ci"
56    }
57
58    fn parse(&self, content: &str, source: &PipelineSource) -> Result<AuthorityGraph, TauditError> {
59        let mut de = serde_yaml::Deserializer::from_str(content);
60        let doc = de
61            .next()
62            .ok_or_else(|| TauditError::Parse("empty YAML document".into()))?;
63        let root: Value = Value::deserialize(doc)
64            .map_err(|e| TauditError::Parse(format!("YAML parse error: {e}")))?;
65
66        let mapping = root
67            .as_mapping()
68            .ok_or_else(|| TauditError::Parse("GitLab CI root must be a mapping".into()))?;
69
70        let mut graph = AuthorityGraph::new(source.clone());
71
72        // CI_JOB_TOKEN is always present in every GitLab CI job — it's the built-in
73        // platform token, equivalent to ADO's System.AccessToken or GHA's GITHUB_TOKEN.
74        let mut meta = HashMap::new();
75        meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
76        meta.insert(META_IMPLICIT.into(), "true".into());
77        let token_id = graph.add_node_with_metadata(
78            NodeKind::Identity,
79            "CI_JOB_TOKEN",
80            TrustZone::FirstParty,
81            meta,
82        );
83
84        // Top-level include: → mark Partial immediately
85        if mapping.contains_key("include") {
86            graph.mark_partial(
87                "include: directive present — included templates not resolved".to_string(),
88            );
89        }
90
91        // Global variables
92        let global_secrets = process_variables(mapping.get("variables"), &mut graph, "pipeline");
93
94        // Global image
95        let global_image = mapping.get("image").and_then(extract_image_str);
96
97        // Top-level merge_request trigger detection from `workflow:` rules
98        if let Some(wf) = mapping.get("workflow") {
99            if has_mr_trigger_in_workflow(wf) {
100                graph
101                    .metadata
102                    .insert(META_TRIGGER.into(), "merge_request".into());
103            }
104        }
105
106        // Process each job (any top-level key not in RESERVED)
107        for (key, value) in mapping {
108            let job_name = match key.as_str() {
109                Some(k) => k,
110                None => continue,
111            };
112            if RESERVED.contains(&job_name) {
113                continue;
114            }
115
116            // Hidden jobs (starting with a dot) are templates — mark Partial, skip
117            if job_name.starts_with('.') {
118                graph.mark_partial(format!(
119                    "job '{job_name}' is a hidden/template job — not resolved"
120                ));
121                continue;
122            }
123
124            let job_map = match value.as_mapping() {
125                Some(m) => m,
126                None => continue,
127            };
128
129            // extends: — job template inheritance, can't resolve statically
130            if job_map.contains_key("extends") {
131                graph.mark_partial(format!(
132                    "job '{job_name}' uses extends: — inherited configuration not resolved"
133                ));
134            }
135
136            // Detect PR/MR trigger in this job's rules: or only:
137            let job_triggers_mr = job_has_mr_trigger(job_map);
138
139            // Propagate job MR trigger to graph level
140            if job_triggers_mr && !graph.metadata.contains_key(META_TRIGGER) {
141                graph
142                    .metadata
143                    .insert(META_TRIGGER.into(), "merge_request".into());
144            }
145
146            // Job-level variables
147            let job_secrets = process_variables(job_map.get("variables"), &mut graph, job_name);
148
149            // Job-level explicit secrets: (Vault, AWS Secrets Manager, GCP, Azure)
150            let explicit_secrets =
151                process_explicit_secrets(job_map.get("secrets"), job_name, &mut graph);
152
153            // Job-level OIDC tokens (id_tokens:)
154            let oidc_identities = process_id_tokens(job_map.get("id_tokens"), job_name, &mut graph);
155
156            // Job image (falls back to global)
157            let job_image_str = job_map
158                .get("image")
159                .and_then(extract_image_str)
160                .or(global_image.as_deref().map(String::from));
161
162            let image_id = job_image_str.as_deref().map(|img| {
163                let pinned = is_docker_digest_pinned(img);
164                let trust_zone = if pinned {
165                    TrustZone::ThirdParty
166                } else {
167                    TrustZone::Untrusted
168                };
169                let mut imeta = HashMap::new();
170                if let Some(digest) = img.split("@sha256:").nth(1) {
171                    imeta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
172                }
173                graph.add_node_with_metadata(NodeKind::Image, img, trust_zone, imeta)
174            });
175
176            // Services (each is an Image node)
177            let service_ids = process_services(job_map.get("services"), &mut graph);
178
179            // Environment — record name as metadata, sets trust boundary marker
180            let env_name = job_map
181                .get("environment")
182                .and_then(extract_environment_name);
183
184            // Create the Step node for this job
185            let mut step_meta = HashMap::new();
186            step_meta.insert(META_JOB_NAME.into(), job_name.to_string());
187            if let Some(ref env) = env_name {
188                step_meta.insert("environment_name".into(), env.clone());
189            }
190            let step_id = graph.add_node_with_metadata(
191                NodeKind::Step,
192                job_name,
193                TrustZone::FirstParty,
194                step_meta,
195            );
196
197            // CI_JOB_TOKEN always available to every step
198            graph.add_edge(step_id, token_id, EdgeKind::HasAccessTo);
199
200            // Link all secrets
201            for &sid in global_secrets
202                .iter()
203                .chain(&job_secrets)
204                .chain(&explicit_secrets)
205            {
206                graph.add_edge(step_id, sid, EdgeKind::HasAccessTo);
207            }
208
209            // Link OIDC identities
210            for &iid in &oidc_identities {
211                graph.add_edge(step_id, iid, EdgeKind::HasAccessTo);
212            }
213
214            // UsesImage edges
215            if let Some(img_id) = image_id {
216                graph.add_edge(step_id, img_id, EdgeKind::UsesImage);
217            }
218            for &svc_id in &service_ids {
219                graph.add_edge(step_id, svc_id, EdgeKind::UsesImage);
220            }
221        }
222
223        Ok(graph)
224    }
225}
226
227/// Detect `image:` string from a YAML value — can be a bare string or a mapping with `name:`.
228fn extract_image_str(v: &Value) -> Option<String> {
229    match v {
230        Value::String(s) => Some(s.clone()),
231        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
232        _ => None,
233    }
234}
235
236/// Extract environment name from `environment:` value (string or mapping).
237fn extract_environment_name(v: &Value) -> Option<String> {
238    match v {
239        Value::String(s) => Some(s.clone()),
240        Value::Mapping(m) => m.get("name").and_then(|n| n.as_str()).map(String::from),
241        _ => None,
242    }
243}
244
245/// Classify a variable name as a credential by checking for common fragments.
246fn is_credential_name(name: &str) -> bool {
247    let upper = name.to_uppercase();
248    CRED_FRAGMENTS.iter().any(|frag| upper.contains(frag))
249}
250
251/// Parse `variables:` mapping and emit `Secret` nodes for credential-pattern names.
252/// Returns the list of created node IDs.
253fn process_variables(vars: Option<&Value>, graph: &mut AuthorityGraph, scope: &str) -> Vec<NodeId> {
254    let mut ids = Vec::new();
255    let map = match vars.and_then(|v| v.as_mapping()) {
256        Some(m) => m,
257        None => return ids,
258    };
259    for (k, _v) in map {
260        let name = match k.as_str() {
261            Some(s) => s,
262            None => continue,
263        };
264        if is_credential_name(name) {
265            let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
266            ids.push(id);
267            let _ = scope; // used for future scoped error messages
268        }
269    }
270    ids
271}
272
273/// Parse `secrets:` block and emit one `Secret` node per named secret.
274///
275/// GitLab CI `secrets:` format:
276/// ```yaml
277/// secrets:
278///   DATABASE_PASSWORD:
279///     vault: production/db/password@secret
280///   AWS_KEY:
281///     aws_secrets_manager:
282///       name: my-secret
283/// ```
284fn process_explicit_secrets(
285    secrets: Option<&Value>,
286    _scope: &str,
287    graph: &mut AuthorityGraph,
288) -> Vec<NodeId> {
289    let mut ids = Vec::new();
290    let map = match secrets.and_then(|v| v.as_mapping()) {
291        Some(m) => m,
292        None => return ids,
293    };
294    for (k, _v) in map {
295        let name = match k.as_str() {
296            Some(s) => s,
297            None => continue,
298        };
299        let id = graph.add_node(NodeKind::Secret, name, TrustZone::FirstParty);
300        ids.push(id);
301    }
302    ids
303}
304
305/// Parse `id_tokens:` block and emit one OIDC `Identity` node per token.
306///
307/// GitLab CI `id_tokens:` format:
308/// ```yaml
309/// id_tokens:
310///   SIGSTORE_ID_TOKEN:
311///     aud: sigstore
312///   AWS_OIDC_TOKEN:
313///     aud: https://sts.amazonaws.com
314/// ```
315fn process_id_tokens(
316    id_tokens: Option<&Value>,
317    _scope: &str,
318    graph: &mut AuthorityGraph,
319) -> Vec<NodeId> {
320    let mut ids = Vec::new();
321    let map = match id_tokens.and_then(|v| v.as_mapping()) {
322        Some(m) => m,
323        None => return ids,
324    };
325    for (k, v) in map {
326        let token_name = match k.as_str() {
327            Some(s) => s,
328            None => continue,
329        };
330        // Extract audience for labelling
331        let aud = v
332            .as_mapping()
333            .and_then(|m| m.get("aud"))
334            .and_then(|a| a.as_str())
335            .unwrap_or("unknown");
336        let label = format!("{token_name} (aud={aud})");
337        let mut meta = HashMap::new();
338        meta.insert(META_OIDC.into(), "true".into());
339        meta.insert(META_IDENTITY_SCOPE.into(), "broad".into());
340        let id =
341            graph.add_node_with_metadata(NodeKind::Identity, label, TrustZone::FirstParty, meta);
342        ids.push(id);
343    }
344    ids
345}
346
347/// Parse `services:` block and emit `Image` nodes.
348fn process_services(services: Option<&Value>, graph: &mut AuthorityGraph) -> Vec<NodeId> {
349    let mut ids = Vec::new();
350    let list = match services.and_then(|v| v.as_sequence()) {
351        Some(s) => s,
352        None => return ids,
353    };
354    for item in list {
355        let img_str = match extract_image_str(item) {
356            Some(s) => s,
357            None => continue,
358        };
359        let pinned = is_docker_digest_pinned(&img_str);
360        let trust_zone = if pinned {
361            TrustZone::ThirdParty
362        } else {
363            TrustZone::Untrusted
364        };
365        let mut meta = HashMap::new();
366        if let Some(digest) = img_str.split("@sha256:").nth(1) {
367            meta.insert(META_DIGEST.into(), format!("sha256:{digest}"));
368        }
369        let id = graph.add_node_with_metadata(NodeKind::Image, &img_str, trust_zone, meta);
370        ids.push(id);
371    }
372    ids
373}
374
375/// Check whether a job's `rules:` or `only:` indicates it runs on merge requests.
376fn job_has_mr_trigger(job_map: &serde_yaml::Mapping) -> bool {
377    // rules: [{if: '$CI_PIPELINE_SOURCE == "merge_request_event"'}]
378    if let Some(rules) = job_map.get("rules").and_then(|v| v.as_sequence()) {
379        for rule in rules {
380            if let Some(if_expr) = rule
381                .as_mapping()
382                .and_then(|m| m.get("if"))
383                .and_then(|v| v.as_str())
384            {
385                if if_expr.contains("merge_request_event") {
386                    return true;
387                }
388            }
389        }
390    }
391    // only: [merge_requests] or only: {refs: [merge_requests]}
392    if let Some(only) = job_map.get("only") {
393        if only_has_merge_requests(only) {
394            return true;
395        }
396    }
397    false
398}
399
400/// Check `only:` value (sequence or mapping) for `merge_requests` entry.
401fn only_has_merge_requests(v: &Value) -> bool {
402    match v {
403        Value::Sequence(seq) => seq
404            .iter()
405            .any(|item| item.as_str() == Some("merge_requests")),
406        Value::Mapping(m) => {
407            if let Some(refs) = m.get("refs").and_then(|r| r.as_sequence()) {
408                return refs
409                    .iter()
410                    .any(|item| item.as_str() == Some("merge_requests"));
411            }
412            false
413        }
414        _ => false,
415    }
416}
417
418/// Check top-level `workflow:` rules for MR trigger.
419fn has_mr_trigger_in_workflow(wf: &Value) -> bool {
420    let rules = match wf
421        .as_mapping()
422        .and_then(|m| m.get("rules"))
423        .and_then(|r| r.as_sequence())
424    {
425        Some(r) => r,
426        None => return false,
427    };
428    for rule in rules {
429        if let Some(if_expr) = rule
430            .as_mapping()
431            .and_then(|m| m.get("if"))
432            .and_then(|v| v.as_str())
433        {
434            if if_expr.contains("merge_request_event") {
435                return true;
436            }
437        }
438    }
439    false
440}
441
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `GitlabParser` over `yaml` and return the graph, panicking if
    /// parsing fails.
    fn parse(yaml: &str) -> AuthorityGraph {
        let source = PipelineSource {
            file: ".gitlab-ci.yml".into(),
            repo: None,
            git_ref: None,
        };
        GitlabParser.parse(yaml, &source).unwrap()
    }

    /// A pipeline with no explicit identities still carries the implicit,
    /// broad-scoped `CI_JOB_TOKEN`.
    #[test]
    fn ci_job_token_always_present() {
        const YAML: &str = r#"
stages:
  - build

build-job:
  stage: build
  script:
    - make build
"#;
        let graph = parse(YAML);
        let identities: Vec<_> = graph.nodes_of_kind(NodeKind::Identity).collect();
        assert_eq!(identities.len(), 1);

        let token = &identities[0];
        assert_eq!(token.name, "CI_JOB_TOKEN");
        for (key, expected) in [(META_IMPLICIT, "true"), (META_IDENTITY_SCOPE, "broad")] {
            assert_eq!(
                token.metadata.get(key).map(String::as_str),
                Some(expected)
            );
        }
    }

    /// Credential-looking pipeline variables become secrets; plain
    /// configuration variables do not.
    #[test]
    fn global_credential_variable_emits_secret_node() {
        const YAML: &str = r#"
variables:
  APP_VERSION: "1.0"
  DEPLOY_TOKEN: "$CI_DEPLOY_TOKEN"

build-job:
  script:
    - make
"#;
        let graph = parse(YAML);
        let secrets: Vec<_> = graph.nodes_of_kind(NodeKind::Secret).collect();
        let has_secret = |wanted: &str| secrets.iter().any(|s| s.name == wanted);

        assert!(
            has_secret("DEPLOY_TOKEN"),
            "DEPLOY_TOKEN must emit a Secret node, got: {:?}",
            secrets.iter().map(|s| &s.name).collect::<Vec<_>>()
        );
        // Plain config variable must not emit Secret
        assert!(
            !has_secret("APP_VERSION"),
            "APP_VERSION must not emit a Secret node"
        );
    }

    /// A tag-only image reference is untrusted.
    #[test]
    fn floating_image_emits_untrusted_image_node() {
        const YAML: &str = r#"
deploy:
  image: alpine:latest
  script:
    - deploy.sh
"#;
        let graph = parse(YAML);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(images.len(), 1);

        let image = &images[0];
        assert_eq!(image.name, "alpine:latest");
        assert_eq!(image.trust_zone, TrustZone::Untrusted);
    }

    /// An image pinned by sha256 digest is third-party.
    #[test]
    fn digest_pinned_image_is_third_party() {
        const YAML: &str = r#"
deploy:
  image: "alpine@sha256:a5ac7e51b41094c92402da3b24376905380afc29a5ac7e51b41094c92402da3b"
  script:
    - deploy.sh
"#;
        let graph = parse(YAML);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].trust_zone, TrustZone::ThirdParty);
    }

    /// Every `id_tokens:` entry becomes an identity flagged as OIDC.
    #[test]
    fn id_tokens_emit_oidc_identity_nodes() {
        const YAML: &str = r#"
deploy:
  id_tokens:
    SIGSTORE_ID_TOKEN:
      aud: sigstore
    AWS_OIDC_TOKEN:
      aud: https://sts.amazonaws.com
  script:
    - deploy.sh
"#;
        let graph = parse(YAML);
        let oidc: Vec<_> = graph
            .nodes_of_kind(NodeKind::Identity)
            .filter(|node| node.metadata.get(META_OIDC).map(String::as_str) == Some("true"))
            .collect();
        assert_eq!(
            oidc.len(),
            2,
            "expected 2 OIDC identity nodes, got: {:?}",
            oidc.iter().map(|n| &n.name).collect::<Vec<_>>()
        );
    }

    /// Every key under `secrets:` becomes a secret, whatever its provider.
    #[test]
    fn explicit_secrets_emit_secret_nodes() {
        const YAML: &str = r#"
deploy:
  secrets:
    DATABASE_PASSWORD:
      vault: production/db/password@secret
    AWS_KEY:
      aws_secrets_manager:
        name: my-secret
  script:
    - deploy.sh
"#;
        let graph = parse(YAML);
        let secrets: Vec<_> = graph.nodes_of_kind(NodeKind::Secret).collect();
        let names: Vec<_> = secrets.iter().map(|s| s.name.as_str()).collect();
        for expected in ["DATABASE_PASSWORD", "AWS_KEY"] {
            assert!(names.contains(&expected), "got: {names:?}");
        }
    }

    /// A `rules:` condition on `merge_request_event` records the trigger.
    #[test]
    fn rules_mr_trigger_sets_meta_trigger() {
        const YAML: &str = r#"
test:
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
  script:
    - run tests
"#;
        let graph = parse(YAML);
        let trigger = graph.metadata.get(META_TRIGGER).map(String::as_str);
        assert_eq!(
            trigger,
            Some("merge_request"),
            "META_TRIGGER must be set to merge_request"
        );
    }

    /// `only: [merge_requests]` records the trigger as well.
    #[test]
    fn only_merge_requests_sets_meta_trigger() {
        const YAML: &str = r#"
test:
  only:
    - merge_requests
  script:
    - run tests
"#;
        let graph = parse(YAML);
        let trigger = graph.metadata.get(META_TRIGGER).map(String::as_str);
        assert_eq!(trigger, Some("merge_request"));
    }

    /// Unresolved `include:` content makes the graph partial.
    #[test]
    fn include_marks_graph_partial() {
        const YAML: &str = r#"
include:
  - local: '/templates/.base.yml'

build:
  script:
    - make
"#;
        let graph = parse(YAML);
        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
    }

    /// Unresolved `extends:` inheritance makes the graph partial.
    #[test]
    fn extends_marks_graph_partial() {
        const YAML: &str = r#"
.base:
  script:
    - echo base

my-job:
  extends: .base
  stage: build
"#;
        let graph = parse(YAML);
        assert_eq!(graph.completeness, AuthorityCompleteness::Partial);
    }

    /// Each step carries its job name in `META_JOB_NAME`.
    #[test]
    fn meta_job_name_set_on_step_nodes() {
        const YAML: &str = r#"
build:
  script:
    - make
deploy:
  script:
    - deploy.sh
"#;
        let graph = parse(YAML);
        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
        assert_eq!(steps.len(), 2);
        for step in &steps {
            assert!(
                step.metadata.contains_key(META_JOB_NAME),
                "Step '{}' missing META_JOB_NAME",
                step.name
            );
        }

        // Verify job names are correct
        let names: Vec<_> = steps
            .iter()
            .map(|s| s.metadata.get(META_JOB_NAME).unwrap().as_str())
            .collect();
        for expected in ["build", "deploy"] {
            assert!(names.contains(&expected), "got: {names:?}");
        }
    }

    /// Reserved top-level keys never become steps.
    #[test]
    fn reserved_keywords_not_parsed_as_jobs() {
        const YAML: &str = r#"
stages:
  - build
  - test

variables:
  MY_VAR: value

image: alpine:latest

build:
  stage: build
  script:
    - make
"#;
        let graph = parse(YAML);
        let steps: Vec<_> = graph.nodes_of_kind(NodeKind::Step).collect();
        assert_eq!(
            steps.len(),
            1,
            "only 'build' should be a Step, got: {:?}",
            steps.iter().map(|s| &s.name).collect::<Vec<_>>()
        );
        assert_eq!(steps[0].name, "build");
    }

    /// Both the string form and the `name:` form of a service emit an image.
    #[test]
    fn services_emit_image_nodes() {
        const YAML: &str = r#"
test:
  services:
    - docker:dind
    - name: postgres:14
  script:
    - run_tests
"#;
        let graph = parse(YAML);
        let images: Vec<_> = graph.nodes_of_kind(NodeKind::Image).collect();
        assert_eq!(
            images.len(),
            2,
            "expected 2 service Image nodes, got: {:?}",
            images.iter().map(|i| &i.name).collect::<Vec<_>>()
        );
    }
}