Skip to main content

openclaw_scan/scanner/
secrets.rs

1//! Secret and credential detection scanner.
2//!
3//! Scans all text files inside the installation root for patterns that match
4//! known API key formats from every major AI provider and common infrastructure
5//! services.  Evidence is always **redacted** before being stored.
6
7use std::path::Path;
8
9use anyhow::Result;
10use once_cell::sync::Lazy;
11use regex::Regex;
12use walkdir::WalkDir;
13
14use crate::finding::{redact, Category, Finding, Severity};
15use crate::scanner::{ScanContext, Scanner};
16
17// ── Regex patterns ────────────────────────────────────────────────────────────
18
19struct SecretPattern {
20    name: &'static str,
21    severity: Severity,
22    /// Characters to keep in the redacted evidence prefix.
23    keep: usize,
24    re: Regex,
25}
26
27static PATTERNS: Lazy<Vec<SecretPattern>> = Lazy::new(|| {
28    vec![
29        // ── AI Provider Keys ────────────────────────────────────────────────
30        SecretPattern {
31            name: "Anthropic API key",
32            severity: Severity::Critical,
33            keep: 10,
34            // Relaxed subtype to catch future formats (e.g. sk-ant-sid01-, sk-ant-key01-)
35            re: Regex::new(r"sk-ant-[a-z]{2,8}\d{0,6}-[A-Za-z0-9_-]{20,}").unwrap(),
36        },
37        SecretPattern {
38            name: "OpenAI API key",
39            severity: Severity::Critical,
40            keep: 7,
41            re: Regex::new(r"sk-(?:proj-)?[A-Za-z0-9_-]{20}T3BlbkFJ[A-Za-z0-9_-]{20}").unwrap(),
42        },
43        SecretPattern {
44            name: "OpenAI project key",
45            severity: Severity::Critical,
46            keep: 8,
47            re: Regex::new(r"sk-proj-[A-Za-z0-9_-]{48,}").unwrap(),
48        },
49        SecretPattern {
50            name: "xAI / Grok API key",
51            severity: Severity::Critical,
52            keep: 4,
53            re: Regex::new(r"xai-[A-Za-z0-9_-]{32,}").unwrap(),
54        },
55        SecretPattern {
56            name: "OpenRouter API key",
57            severity: Severity::Critical,
58            keep: 9,
59            re: Regex::new(r"sk-or-v1-[A-Za-z0-9_-]{48,}").unwrap(),
60        },
61        SecretPattern {
62            name: "Google AI / Gemini key",
63            severity: Severity::Critical,
64            keep: 4,
65            re: Regex::new(r"AIza[0-9A-Za-z\-_]{35}").unwrap(),
66        },
67        SecretPattern {
68            name: "Hugging Face token",
69            severity: Severity::High,
70            keep: 3,
71            re: Regex::new(r"hf_[A-Za-z0-9]{34}").unwrap(),
72        },
73        // ── Cloud / Infrastructure ───────────────────────────────────────────
74        SecretPattern {
75            name: "AWS access key ID",
76            severity: Severity::Critical,
77            keep: 4,
78            re: Regex::new(r"(?:A3T[A-Z0-9]|AKIA|ASIA|ABIA|ACCA)[A-Z0-9]{16}").unwrap(),
79        },
80        SecretPattern {
81            name: "GitHub personal access token",
82            severity: Severity::High,
83            keep: 4,
84            re: Regex::new(r"ghp_[0-9a-zA-Z]{36}").unwrap(),
85        },
86        SecretPattern {
87            name: "GitHub OAuth token",
88            severity: Severity::High,
89            keep: 4,
90            re: Regex::new(r"gho_[0-9a-zA-Z]{36}").unwrap(),
91        },
92        SecretPattern {
93            name: "GitHub fine-grained PAT",
94            severity: Severity::High,
95            keep: 11,
96            re: Regex::new(r"github_pat_[0-9a-zA-Z_]{82}").unwrap(),
97        },
98        SecretPattern {
99            name: "GitLab personal access token",
100            severity: Severity::High,
101            keep: 7,
102            re: Regex::new(r"glpat-[0-9a-zA-Z\-_]{20}").unwrap(),
103        },
104        // ── Credentials & Keys ──────────────────────────────────────────────
105        SecretPattern {
106            name: "PEM private key",
107            severity: Severity::Critical,
108            keep: 11,
109            re: Regex::new(r"-----BEGIN (?:RSA |DSA |EC |OPENSSH )?PRIVATE KEY").unwrap(),
110        },
111        SecretPattern {
112            name: "JWT token",
113            severity: Severity::High,
114            keep: 5,
115            re: Regex::new(r"eyJ[A-Za-z0-9_-]{4,}\.eyJ[A-Za-z0-9_-]{4,}\.[A-Za-z0-9_-]{4,}").unwrap(),
116        },
117        SecretPattern {
118            name: "Database connection string with credentials",
119            severity: Severity::High,
120            keep: 8,
121            re: Regex::new(r"(?i)(?:postgres|mysql|mongodb|redis)://[^:@\s]{1,64}:[^@\s]{1,64}@").unwrap(),
122        },
123        SecretPattern {
124            name: "Generic high-entropy secret",
125            severity: Severity::Medium,
126            keep: 6,
127            // Capture group 1 isolates the credential value so evidence redaction
128            // operates on the secret itself, not the surrounding key=value text.
129            re: Regex::new(
130                r#"(?i)(?:api[_-]?key|secret[_-]?key?|auth[_-]?token|access[_-]?token|password|passwd|pwd)\s*[=:]\s*['"]?([A-Za-z0-9/+!@#$%^&*]{32,})['"]?"#,
131            ).unwrap(),
132        },
133        // NOTE: capture group 1 in the generic pattern above is handled specially
134        // in scan_content — captures().get(1) returns the credential, not the full match.
135    ]
136});
137
138// ── Files to skip ─────────────────────────────────────────────────────────────
139
/// Extensions (lower-case, without the dot) of files that are never scanned.
///
/// The list is meant to cover binary formats that cannot hold text secrets.
/// Callers lower-case a file's extension before looking it up here.
///
/// NOTE(review): `svg` is XML text and could contain a pasted credential —
/// confirm that skipping it is intentional.
const SKIP_EXTENSIONS: &[&str] = &[
    // Images
    "png", "jpg", "jpeg", "gif", "bmp", "ico", "webp", "svg",
    // Audio / video
    "mp3", "mp4", "wav", "ogg", "flac",
    // Archives
    "zip", "gz", "bz2", "tar", "xz", "7z",
    // Office documents
    "pdf", "doc", "docx", "xls", "xlsx",
    // Executables, libraries and other compiled artefacts
    "bin", "exe", "dll", "so", "dylib", "wasm", "class",
];

/// Upper bound, in bytes, on the size of a file that will be scanned
/// (8 * 1024 * 1024 = 8 MiB). Anything larger is skipped to keep runtime low.
const MAX_FILE_SIZE: u64 = 8 * 1024 * 1024;
149
150// ── SecretsScanner ────────────────────────────────────────────────────────────
151
152pub struct SecretsScanner;
153
154impl Scanner for SecretsScanner {
155    fn name(&self) -> &'static str {
156        "secrets"
157    }
158
159    fn scan(&self, ctx: &ScanContext) -> Result<Vec<Finding>> {
160        let mut findings = Vec::new();
161        scan_dir(&ctx.root, &mut findings);
162        Ok(findings)
163    }
164}
165
166fn scan_dir(root: &Path, findings: &mut Vec<Finding>) {
167    for entry in WalkDir::new(root)
168        .follow_links(false)
169        .into_iter()
170        .filter_map(|e| e.ok())
171        .filter(|e| e.file_type().is_file())
172    {
173        let path = entry.path();
174
175        // Skip binary file types.
176        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
177            if SKIP_EXTENSIONS.contains(&ext.to_lowercase().as_str()) {
178                continue;
179            }
180        }
181
182        // Skip oversized files.
183        if let Ok(meta) = entry.metadata() {
184            if meta.len() > MAX_FILE_SIZE {
185                continue;
186            }
187        }
188
189        // Use read + from_utf8_lossy so non-UTF-8 files are scanned too (H-3).
190        if let Ok(bytes) = std::fs::read(path) {
191            let content = String::from_utf8_lossy(&bytes);
192            scan_content(&content, path, findings);
193        }
194    }
195}
196
197fn scan_content(content: &str, path: &Path, findings: &mut Vec<Finding>) {
198    for (line_no, line) in content.lines().enumerate() {
199        for pattern in PATTERNS.iter() {
200            // Use captures() for all patterns: group 1 (if present) isolates the
201            // credential value so redaction operates on the secret itself (C-1 fix).
202            // Group 0 is the full match, used for patterns without a capture group.
203            let Some(caps) = pattern.re.captures(line) else {
204                continue;
205            };
206            let matched = caps.get(1).or_else(|| caps.get(0));
207            let Some(m) = matched else { continue };
208
209            let evidence = redact(m.as_str(), pattern.keep);
210            findings.push(
211                Finding::new(
212                    pattern.severity,
213                    Category::SecretDetection,
214                    format!("{} detected", pattern.name),
215                    format!(
216                        "A {} was found in '{}'. This credential may have been \
217                         pasted into a conversation or written by an agent and \
218                         is now stored in plain text.",
219                        pattern.name,
220                        path.display()
221                    ),
222                    path,
223                    "Rotate this credential immediately. Remove the file or \
224                     redact the line. Consider running `ocls` again after \
225                     rotation to verify the credential no longer appears.",
226                )
227                .with_line(line_no + 1)
228                .with_evidence(evidence),
229            );
230            // One finding per pattern per line is enough.
231            break;
232        }
233    }
234}
235
236// ── Tests ─────────────────────────────────────────────────────────────────────
237
/// Unit tests for `scan_content`: detection of each key family, absence of
/// findings on harmless input, evidence redaction, and line numbering.
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Runs `scan_content` on `content` under a fixed dummy path and returns
    /// whatever findings it produced.
    fn findings_for(content: &str) -> Vec<Finding> {
        let mut findings = Vec::new();
        scan_content(content, &PathBuf::from("/test/file.json"), &mut findings);
        findings
    }

    // ── AI provider key detection ─────────────────────────────────────────────

    #[test]
    fn detects_anthropic_key() {
        let content = r#"{"token": "sk-ant-api03-abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGH"}"#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect Anthropic key");
        assert_eq!(findings[0].severity, Severity::Critical);
        assert!(findings[0].title.contains("Anthropic"));
    }

    // Exercises the `sk-proj-` project-key format.
    #[test]
    fn detects_openai_key() {
        let content = r#"api_key = "sk-proj-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz""#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect OpenAI project key");
        assert_eq!(findings[0].severity, Severity::Critical);
    }

    #[test]
    fn detects_xai_key() {
        let content = r#"key = "xai-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefgh""#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect xAI key");
        assert_eq!(findings[0].severity, Severity::Critical);
    }

    #[test]
    fn detects_openrouter_key() {
        let content =
            r#"key = "sk-or-v1-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12345678""#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect OpenRouter key");
    }

    #[test]
    fn detects_google_ai_key() {
        let content = r#"key = "AIzaABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghi""#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect Google AI key");
    }

    // ── Cloud / infrastructure and credential detection ───────────────────────

    #[test]
    fn detects_aws_key() {
        let content = "access_key = AKIAIOSFODNN7EXAMPLE";
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect AWS key");
        assert_eq!(findings[0].severity, Severity::Critical);
    }

    #[test]
    fn detects_github_pat() {
        let content = "token = ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij";
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect GitHub PAT");
        assert_eq!(findings[0].severity, Severity::High);
    }

    #[test]
    fn detects_gitlab_token() {
        let content = "token = glpat-abcdefghijklmnopqrst";
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect GitLab token");
    }

    #[test]
    fn detects_private_key_header() {
        let content = "-----BEGIN RSA PRIVATE KEY-----\nMIIEo...";
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect PEM private key");
        assert_eq!(findings[0].severity, Severity::Critical);
    }

    #[test]
    fn detects_database_url_with_credentials() {
        let content = r#"DATABASE_URL=postgres://admin:supersecret@localhost:5432/mydb"#;
        let findings = findings_for(content);
        assert!(!findings.is_empty(), "should detect DB connection string");
    }

    // ── No false positives ────────────────────────────────────────────────────

    #[test]
    fn no_false_positive_on_empty_line() {
        assert!(findings_for("").is_empty());
    }

    #[test]
    fn no_false_positive_on_safe_json() {
        let content = r#"{"model": "claude-3-5-sonnet-20241022", "max_tokens": 4096}"#;
        assert!(findings_for(content).is_empty());
    }

    #[test]
    fn no_false_positive_on_env_variable_reference() {
        // Env var references are not credentials
        let content = r#"api_key = "${OPENAI_API_KEY}""#;
        // Generic pattern might fire for this; the result is acceptable either way,
        // but the value captured must be redacted and short enough not to be a real key.
        // We just verify it doesn't panic.
        let _findings = findings_for(content);
    }

    // ── Evidence is always redacted ───────────────────────────────────────────

    #[test]
    fn evidence_is_redacted() {
        let content = "token = AKIAIOSFODNN7EXAMPLE";
        let findings = findings_for(content);
        // Fail loudly when nothing is detected or no evidence is attached;
        // otherwise the redaction assertions below would never run and the
        // test would pass vacuously.
        let ev = findings
            .first()
            .and_then(|f| f.evidence.as_deref())
            .expect("AWS key should produce a finding with evidence attached");
        assert!(ev.contains("****"), "evidence must be redacted: {}", ev);
        assert!(
            !ev.contains("EXAMPLE"),
            "evidence must not contain full secret"
        );
    }

    // ── Line numbers ──────────────────────────────────────────────────────────

    #[test]
    fn finding_has_correct_line_number() {
        let content = "normal line\ntoken = AKIAIOSFODNN7EXAMPLE\nanother line";
        let findings = findings_for(content);
        assert!(!findings.is_empty());
        assert_eq!(findings[0].line, Some(2));
    }
}