openclaw-scan 0.1.1

Security scanner for agentic AI framework installations (OpenClaw, Claude Code, and compatible)
Documentation
//! Secret and credential detection scanner.
//!
//! Scans all text files inside the installation root for patterns that match
//! known API key formats from every major AI provider and common infrastructure
//! services.  Evidence is always **redacted** before being stored.

use std::path::Path;

use anyhow::Result;
use once_cell::sync::Lazy;
use regex::Regex;
use walkdir::WalkDir;

use crate::finding::{redact, Category, Finding, Severity};
use crate::scanner::{ScanContext, Scanner};

// ── Regex patterns ────────────────────────────────────────────────────────────

/// A single detection rule: a compiled regex plus the metadata needed to
/// report a match as a finding.
struct SecretPattern {
    /// Human-readable credential type; used in the finding title and description.
    name: &'static str,
    /// Severity assigned to every finding produced by this rule.
    severity: Severity,
    /// Characters to keep in the redacted evidence prefix.
    keep: usize,
    /// Compiled expression. When it defines capture group 1, that group is
    /// treated as the credential value; otherwise the whole match is.
    re: Regex,
}

/// Detection rules, compiled once on first access.
///
/// Order matters: `scan_content` tries the rules from top to bottom, so the
/// provider-specific rules are listed first and the generic key/value rule
/// last.
///
/// Each `unwrap()` can only panic if one of the regex literals below is
/// malformed, which would surface the first time the table is used.
static PATTERNS: Lazy<Vec<SecretPattern>> = Lazy::new(|| {
    vec![
        // ── AI Provider Keys ────────────────────────────────────────────────
        SecretPattern {
            name: "Anthropic API key",
            severity: Severity::Critical,
            keep: 10,
            // Relaxed subtype to catch future formats (e.g. sk-ant-sid01-, sk-ant-key01-)
            re: Regex::new(r"sk-ant-[a-z]{2,8}\d{0,6}-[A-Za-z0-9_-]{20,}").unwrap(),
        },
        SecretPattern {
            name: "OpenAI API key",
            severity: Severity::Critical,
            keep: 7,
            re: Regex::new(r"sk-(?:proj-)?[A-Za-z0-9_-]{20}T3BlbkFJ[A-Za-z0-9_-]{20}").unwrap(),
        },
        SecretPattern {
            name: "OpenAI project key",
            severity: Severity::Critical,
            keep: 8,
            re: Regex::new(r"sk-proj-[A-Za-z0-9_-]{48,}").unwrap(),
        },
        SecretPattern {
            name: "xAI / Grok API key",
            severity: Severity::Critical,
            keep: 4,
            re: Regex::new(r"xai-[A-Za-z0-9_-]{32,}").unwrap(),
        },
        SecretPattern {
            name: "OpenRouter API key",
            severity: Severity::Critical,
            keep: 9,
            re: Regex::new(r"sk-or-v1-[A-Za-z0-9_-]{48,}").unwrap(),
        },
        SecretPattern {
            name: "Google AI / Gemini key",
            severity: Severity::Critical,
            keep: 4,
            re: Regex::new(r"AIza[0-9A-Za-z\-_]{35}").unwrap(),
        },
        SecretPattern {
            name: "Hugging Face token",
            severity: Severity::High,
            keep: 3,
            re: Regex::new(r"hf_[A-Za-z0-9]{34}").unwrap(),
        },
        // ── Cloud / Infrastructure ───────────────────────────────────────────
        SecretPattern {
            name: "AWS access key ID",
            severity: Severity::Critical,
            keep: 4,
            re: Regex::new(r"(?:A3T[A-Z0-9]|AKIA|ASIA|ABIA|ACCA)[A-Z0-9]{16}").unwrap(),
        },
        SecretPattern {
            name: "GitHub personal access token",
            severity: Severity::High,
            keep: 4,
            re: Regex::new(r"ghp_[0-9a-zA-Z]{36}").unwrap(),
        },
        SecretPattern {
            name: "GitHub OAuth token",
            severity: Severity::High,
            keep: 4,
            re: Regex::new(r"gho_[0-9a-zA-Z]{36}").unwrap(),
        },
        SecretPattern {
            name: "GitHub fine-grained PAT",
            severity: Severity::High,
            keep: 11,
            re: Regex::new(r"github_pat_[0-9a-zA-Z_]{82}").unwrap(),
        },
        SecretPattern {
            name: "GitLab personal access token",
            severity: Severity::High,
            keep: 7,
            re: Regex::new(r"glpat-[0-9a-zA-Z\-_]{20}").unwrap(),
        },
        // ── Credentials & Keys ──────────────────────────────────────────────
        SecretPattern {
            name: "PEM private key",
            severity: Severity::Critical,
            keep: 11,
            re: Regex::new(r"-----BEGIN (?:RSA |DSA |EC |OPENSSH )?PRIVATE KEY").unwrap(),
        },
        SecretPattern {
            name: "JWT token",
            severity: Severity::High,
            keep: 5,
            re: Regex::new(r"eyJ[A-Za-z0-9_-]{4,}\.eyJ[A-Za-z0-9_-]{4,}\.[A-Za-z0-9_-]{4,}").unwrap(),
        },
        SecretPattern {
            name: "Database connection string with credentials",
            severity: Severity::High,
            keep: 8,
            re: Regex::new(r"(?i)(?:postgres|mysql|mongodb|redis)://[^:@\s]{1,64}:[^@\s]{1,64}@").unwrap(),
        },
        SecretPattern {
            name: "Generic high-entropy secret",
            severity: Severity::Medium,
            keep: 6,
            // Capture group 1 isolates the credential value so evidence redaction
            // operates on the secret itself, not the surrounding key=value text.
            //
            // `secret(?:[_-]?key)?` makes the whole "key" suffix optional, so both
            // `secret = …` / `client_secret: …` and `secret_key = …` are covered.
            // (A bare `key?` would only make the final `y` optional.)
            re: Regex::new(
                r#"(?i)(?:api[_-]?key|secret(?:[_-]?key)?|auth[_-]?token|access[_-]?token|password|passwd|pwd)\s*[=:]\s*['"]?([A-Za-z0-9/+!@#$%^&*]{32,})['"]?"#,
            ).unwrap(),
        },
        // NOTE: capture group 1 in the generic pattern above is handled specially
        // in scan_content — captures().get(1) returns the credential, not the full match.
    ]
});

// ── Files to skip ─────────────────────────────────────────────────────────────

/// File extensions we skip — formats treated as binary (images, audio/video,
/// archives, office documents, executables and compiled objects).
///
/// Entries must be lowercase: `scan_dir` lowercases a file's extension before
/// looking it up in this list.
const SKIP_EXTENSIONS: &[&str] = &[
    "png", "jpg", "jpeg", "gif", "bmp", "ico", "webp", "svg", "mp3", "mp4", "wav", "ogg", "flac",
    "zip", "gz", "bz2", "tar", "xz", "7z", "pdf", "doc", "docx", "xls", "xlsx", "bin", "exe",
    "dll", "so", "dylib", "wasm", "class",
];

/// Maximum file size to scan, in bytes (8 MiB). Files strictly larger than this
/// are skipped to keep runtime low; a file of exactly this size is still scanned.
const MAX_FILE_SIZE: u64 = 8 * 1024 * 1024;

// ── SecretsScanner ────────────────────────────────────────────────────────────

/// Scanner that searches the files under the scan root for hard-coded secrets
/// and credentials. Stateless; its `Scanner` name is `"secrets"`.
pub struct SecretsScanner;

impl Scanner for SecretsScanner {
    /// Identifier under which this scanner is known.
    fn name(&self) -> &'static str {
        "secrets"
    }

    /// Walks everything below `ctx.root` and returns the secrets that were found.
    ///
    /// The directory walk is best-effort, so this implementation always
    /// returns `Ok`.
    fn scan(&self, ctx: &ScanContext) -> Result<Vec<Finding>> {
        let mut collected: Vec<Finding> = Vec::new();
        scan_dir(&ctx.root, &mut collected);
        Ok(collected)
    }
}

/// Recursively walks `root` and runs `scan_content` on every regular file that
/// is neither excluded by extension nor larger than `MAX_FILE_SIZE`.
///
/// The walk is best-effort: symlinks are not followed, and directory entries
/// or files that cannot be read are skipped silently.
fn scan_dir(root: &Path, findings: &mut Vec<Finding>) {
    let walker = WalkDir::new(root).follow_links(false);

    for item in walker {
        // Entries that could not be read are ignored.
        let Ok(entry) = item else { continue };
        if !entry.file_type().is_file() {
            continue;
        }

        let file_path = entry.path();

        // Skip binary file types (extension compared case-insensitively).
        let excluded_by_ext = file_path
            .extension()
            .and_then(|ext| ext.to_str())
            .map(|ext| ext.to_lowercase())
            .map_or(false, |ext| SKIP_EXTENSIONS.contains(&ext.as_str()));
        if excluded_by_ext {
            continue;
        }

        // Skip oversized files. If the metadata is unavailable the size is
        // unknown and the file is still attempted.
        let oversized = entry
            .metadata()
            .map(|meta| meta.len() > MAX_FILE_SIZE)
            .unwrap_or(false);
        if oversized {
            continue;
        }

        // Read raw bytes and decode lossily so non-UTF-8 files are scanned too (H-3).
        let Ok(raw) = std::fs::read(file_path) else {
            continue;
        };
        let text = String::from_utf8_lossy(&raw);
        scan_content(&text, file_path, findings);
    }
}

/// Scans `content` line by line and appends a [`Finding`] to `findings` for
/// each secret detected.
///
/// Every rule in `PATTERNS` is tried against every line, so a line that holds
/// several different credentials (common in minified JSON) yields one finding
/// per credential. Each rule reports at most one match per line. A match whose
/// text overlaps something already reported on the same line is skipped, so a
/// key that satisfies two rules is reported once, under the rule listed first
/// in `PATTERNS`.
///
/// * `content` – text to scan.
/// * `path` – file the text came from; recorded on each finding.
/// * `findings` – output vector; it is only appended to.
///
/// Reported line numbers are 1-based. Evidence is passed through [`redact`]
/// with the rule's `keep` value before it is attached to the finding.
fn scan_content(content: &str, path: &Path, findings: &mut Vec<Finding>) {
    for (line_no, line) in content.lines().enumerate() {
        // Byte ranges `(start, end)` of credentials already reported on this line.
        let mut reported: Vec<(usize, usize)> = Vec::new();

        for pattern in PATTERNS.iter() {
            // Group 1 (if present) isolates the credential value so redaction
            // operates on the secret itself (C-1 fix); group 0, the full match,
            // is used for patterns without a capture group. Take the first match
            // that does not overlap a credential an earlier rule already reported.
            let hit = pattern
                .re
                .captures_iter(line)
                .filter_map(|caps| caps.get(1).or_else(|| caps.get(0)))
                .find(|m| {
                    !reported
                        .iter()
                        .any(|&(start, end)| m.start() < end && start < m.end())
                });
            let Some(m) = hit else { continue };
            reported.push((m.start(), m.end()));

            let evidence = redact(m.as_str(), pattern.keep);
            findings.push(
                Finding::new(
                    pattern.severity,
                    Category::SecretDetection,
                    format!("{} detected", pattern.name),
                    format!(
                        "A {} was found in '{}'. This credential may have been \
                         pasted into a conversation or written by an agent and \
                         is now stored in plain text.",
                        pattern.name,
                        path.display()
                    ),
                    path,
                    "Rotate this credential immediately. Remove the file or \
                     redact the line. Consider running `ocls` again after \
                     rotation to verify the credential no longer appears.",
                )
                .with_line(line_no + 1)
                .with_evidence(evidence),
            );
            // One finding per pattern per line is enough; keep going so that
            // other kinds of credentials on the same line are reported as well.
        }
    }
}

// ── Tests ─────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    //! Unit tests for `scan_content`: one positive case per credential family,
    //! a few negative cases, and checks on redaction and line numbering.

    use super::*;
    use std::path::PathBuf;

    /// Runs `scan_content` on `content` with a fixed dummy path and returns
    /// whatever findings it produced.
    fn findings_for(content: &str) -> Vec<Finding> {
        let mut findings = Vec::new();
        scan_content(content, &PathBuf::from("/test/file.json"), &mut findings);
        findings
    }

    // ── AI provider key detection ─────────────────────────────────────────────

    #[test]
    fn detects_anthropic_key() {
        let content = r#"{"token": "sk-ant-api03-abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGH"}"#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect Anthropic key");
        assert_eq!(findings[0].severity, Severity::Critical);
        assert!(findings[0].title.contains("Anthropic"));
    }

    #[test]
    fn detects_openai_key() {
        let content = r#"api_key = "sk-proj-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz""#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect OpenAI project key");
        assert_eq!(findings[0].severity, Severity::Critical);
    }

    #[test]
    fn detects_xai_key() {
        let content = r#"key = "xai-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefgh""#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect xAI key");
        assert_eq!(findings[0].severity, Severity::Critical);
    }

    #[test]
    fn detects_openrouter_key() {
        let content =
            r#"key = "sk-or-v1-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12345678""#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect OpenRouter key");
    }

    #[test]
    fn detects_google_ai_key() {
        let content = r#"key = "AIzaABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghi""#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect Google AI key");
    }

    #[test]
    fn detects_aws_key() {
        let content = "access_key = AKIAIOSFODNN7EXAMPLE";
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect AWS key");
        assert_eq!(findings[0].severity, Severity::Critical);
    }

    #[test]
    fn detects_github_pat() {
        let content = "token = ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij";
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect GitHub PAT");
        assert_eq!(findings[0].severity, Severity::High);
    }

    #[test]
    fn detects_gitlab_token() {
        let content = "token = glpat-abcdefghijklmnopqrst";
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect GitLab token");
    }

    #[test]
    fn detects_private_key_header() {
        let content = "-----BEGIN RSA PRIVATE KEY-----\nMIIEo...";
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect PEM private key");
        assert_eq!(findings[0].severity, Severity::Critical);
    }

    #[test]
    fn detects_database_url_with_credentials() {
        let content = r#"DATABASE_URL=postgres://admin:supersecret@localhost:5432/mydb"#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect DB connection string");
    }

    // ── No false positives ────────────────────────────────────────────────────

    #[test]
    fn no_false_positive_on_empty_line() {
        assert!(findings_for("").is_empty());
    }

    #[test]
    fn no_false_positive_on_safe_json() {
        let content = r#"{"model": "claude-3-5-sonnet-20241022", "max_tokens": 4096}"#;
        assert!(findings_for(content).is_empty());
    }

    #[test]
    fn no_false_positive_on_env_variable_reference() {
        // Env var references are not credentials
        let content = r#"api_key = "${OPENAI_API_KEY}""#;
        // Generic pattern might fire for this; the result is acceptable either way,
        // but the value captured must be redacted and short enough not to be a real key.
        // We just verify it doesn't panic.
        let _findings = findings_for(content);
    }

    // ── Evidence is always redacted ───────────────────────────────────────────

    #[test]
    fn evidence_is_redacted() {
        let content = "token = AKIAIOSFODNN7EXAMPLE";
        let findings = findings_for(content);
        // Fail loudly when nothing was detected or no evidence was attached;
        // otherwise the redaction checks below would be skipped and the test
        // would pass without verifying anything.
        let ev = findings
            .first()
            .and_then(|f| f.evidence.as_deref())
            .expect("AWS key finding must carry evidence");
        assert!(ev.contains("****"), "evidence must be redacted: {}", ev);
        assert!(
            !ev.contains("EXAMPLE"),
            "evidence must not contain full secret"
        );
    }

    // ── Line numbers ──────────────────────────────────────────────────────────

    #[test]
    fn finding_has_correct_line_number() {
        let content = "normal line\ntoken = AKIAIOSFODNN7EXAMPLE\nanother line";
        let findings = findings_for(content);
        assert!(!findings.is_empty());
        assert_eq!(findings[0].line, Some(2));
    }
}