Skip to main content

harn_vm/stdlib/
secret_scan.rs

1use std::collections::{BTreeMap, BTreeSet};
2use std::sync::LazyLock;
3
4use regex::Regex;
5use serde::{Deserialize, Serialize};
6use sha2::Digest;
7
8use crate::event_log::{active_event_log, EventLog, LogEvent, Topic};
9use crate::value::{VmError, VmValue};
10use crate::vm::Vm;
11
/// Event-log topic name that secret-scan audit events are appended to.
pub const SECRET_SCAN_AUDIT_TOPIC: &str = "audit.secret_scan";
/// Minimum Shannon entropy (bits per character) a quoted value in a
/// credential-like assignment must reach to be reported by the entropy rule.
const HIGH_ENTROPY_THRESHOLD: f64 = 3.5;
14
/// A single suspected secret located by [`scan_content`].
///
/// The raw matched text is never stored; only a redacted preview and a short
/// hash-based fingerprint are kept.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SecretFinding {
    /// Identifier of the rule that produced the finding (e.g. `"github-token"`).
    pub detector: String,
    /// Label attached to the rule (e.g. `"gitleaks"`); presumably the upstream
    /// tool the pattern was modelled on — confirm with the rule authors.
    pub source: String,
    /// Human-readable name of the rule.
    pub title: String,
    /// 1-based line number of the start of the match.
    pub line: usize,
    /// 1-based column of the start of the match, counted in characters.
    pub column_start: usize,
    /// 1-based column of the (exclusive) end of the match, counted in characters.
    pub column_end: usize,
    /// Byte offset of the start of the match within the scanned content.
    pub start_offset: usize,
    /// Byte offset just past the end of the match within the scanned content.
    pub end_offset: usize,
    /// Redacted preview of the matched text.
    pub redacted: String,
    /// First 16 hex characters of the SHA-256 digest of the matched text.
    pub fingerprint: String,
}
28
/// A pattern-based detector: static metadata plus the compiled regex whose
/// whole match is reported as the secret.
struct SecretRule {
    /// Identifier copied into [`SecretFinding::detector`].
    detector: &'static str,
    /// Label copied into [`SecretFinding::source`].
    source: &'static str,
    /// Human-readable name copied into [`SecretFinding::title`].
    title: &'static str,
    /// Compiled pattern run over the scanned content.
    regex: Regex,
}
35
36static SECRET_RULES: LazyLock<Vec<SecretRule>> = LazyLock::new(|| {
37    vec![
38        SecretRule {
39            detector: "aws-access-key-id",
40            source: "gitleaks",
41            title: "AWS access key id",
42            regex: Regex::new(r"\b(?:AKIA|ASIA|AGPA|AIDA|ANPA|AROA|AIPA)[A-Z0-9]{16}\b").unwrap(),
43        },
44        SecretRule {
45            detector: "github-token",
46            source: "gitleaks",
47            title: "GitHub token",
48            regex: Regex::new(r"\bgh(?:p|o|u|s|r)_[A-Za-z0-9]{36,255}\b").unwrap(),
49        },
50        SecretRule {
51            detector: "github-fine-grained-token",
52            source: "gitleaks",
53            title: "GitHub fine-grained personal access token",
54            regex: Regex::new(r"\bgithub_pat_[A-Za-z0-9_]{20,255}\b").unwrap(),
55        },
56        SecretRule {
57            detector: "gitlab-token",
58            source: "detect-secrets",
59            title: "GitLab personal access token",
60            regex: Regex::new(r"\bglpat-[A-Za-z0-9_-]{20,255}\b").unwrap(),
61        },
62        SecretRule {
63            detector: "npm-token",
64            source: "detect-secrets",
65            title: "npm access token",
66            regex: Regex::new(r"\bnpm_[A-Za-z0-9]{36}\b").unwrap(),
67        },
68        SecretRule {
69            detector: "openai-api-key",
70            source: "detect-secrets",
71            title: "OpenAI API key",
72            regex: Regex::new(r"\bsk-[A-Za-z0-9_-]{20,255}\b").unwrap(),
73        },
74        SecretRule {
75            detector: "slack-token",
76            source: "trufflehog",
77            title: "Slack token",
78            regex: Regex::new(r"\bxox(?:a|b|p|r|s)-[A-Za-z0-9-]{10,255}\b").unwrap(),
79        },
80        SecretRule {
81            detector: "stripe-secret-key",
82            source: "trufflehog",
83            title: "Stripe secret or restricted key",
84            regex: Regex::new(r"\b(?:rk|sk)_(?:live|test)_[0-9A-Za-z]{16,255}\b").unwrap(),
85        },
86        SecretRule {
87            detector: "private-key-block",
88            source: "detect-secrets",
89            title: "Private key block",
90            regex: Regex::new(r"(?m)^-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----$").unwrap(),
91        },
92    ]
93});
94
95static HIGH_ENTROPY_ASSIGNMENT_RULE: LazyLock<Regex> = LazyLock::new(|| {
96    Regex::new(
97        r#"(?im)(?:secret|token|api[_-]?key|access[_-]?key|password|passwd|pwd|client[_-]?secret|private[_-]?key)[^\n:=]{0,32}(?::|=)\s*["']([A-Za-z0-9+/=_\.-]{20,})["']"#,
98    )
99    .unwrap()
100});
101
102pub fn scan_content(content: &str) -> Vec<SecretFinding> {
103    let line_starts = line_starts(content);
104    let mut findings = Vec::new();
105
106    for rule in SECRET_RULES.iter() {
107        for mat in rule.regex.find_iter(content) {
108            findings.push(build_finding(
109                content,
110                &line_starts,
111                rule.detector,
112                rule.source,
113                rule.title,
114                mat.start(),
115                mat.end(),
116                mat.as_str(),
117            ));
118        }
119    }
120
121    for captures in HIGH_ENTROPY_ASSIGNMENT_RULE.captures_iter(content) {
122        let Some(secret) = captures.get(1) else {
123            continue;
124        };
125        if shannon_entropy(secret.as_str()) < HIGH_ENTROPY_THRESHOLD {
126            continue;
127        }
128        findings.push(build_finding(
129            content,
130            &line_starts,
131            "high-entropy-credential-assignment",
132            "trufflehog",
133            "High-entropy secret assignment",
134            secret.start(),
135            secret.end(),
136            secret.as_str(),
137        ));
138    }
139
140    findings.sort_by(|left, right| {
141        left.start_offset
142            .cmp(&right.start_offset)
143            .then(left.end_offset.cmp(&right.end_offset))
144            .then(left.detector.cmp(&right.detector))
145    });
146    let specific_spans: BTreeSet<(usize, usize)> = findings
147        .iter()
148        .filter(|finding| finding.detector != "high-entropy-credential-assignment")
149        .map(|finding| (finding.start_offset, finding.end_offset))
150        .collect();
151    findings.retain(|finding| {
152        finding.detector != "high-entropy-credential-assignment"
153            || !specific_spans.contains(&(finding.start_offset, finding.end_offset))
154    });
155    findings.dedup_by(|left, right| {
156        left.detector == right.detector
157            && left.start_offset == right.start_offset
158            && left.end_offset == right.end_offset
159    });
160    findings
161}
162
163pub async fn append_secret_scan_audit<L: EventLog + ?Sized>(
164    event_log: &L,
165    caller: &str,
166    content_len: usize,
167    findings: &[SecretFinding],
168) -> Result<(), crate::event_log::LogError> {
169    let payload = serde_json::json!({
170        "caller": caller,
171        "content_len": content_len,
172        "finding_count": findings.len(),
173        "clean": findings.is_empty(),
174        "findings": findings
175            .iter()
176            .map(|finding| {
177                serde_json::json!({
178                    "detector": finding.detector,
179                    "source": finding.source,
180                    "title": finding.title,
181                    "line": finding.line,
182                    "column_start": finding.column_start,
183                    "column_end": finding.column_end,
184                    "start_offset": finding.start_offset,
185                    "end_offset": finding.end_offset,
186                    "fingerprint": finding.fingerprint,
187                    "redacted": finding.redacted,
188                })
189            })
190            .collect::<Vec<_>>(),
191        "observed_at": crate::orchestration::now_rfc3339(),
192    });
193    let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).expect("secret scan audit topic is valid");
194    let kind = if findings.is_empty() {
195        "scan_clean"
196    } else {
197        "scan_detected"
198    };
199    event_log
200        .append(&topic, LogEvent::new(kind, payload))
201        .await?;
202    Ok(())
203}
204
205pub async fn audit_secret_scan_active(
206    caller: &str,
207    content_len: usize,
208    findings: &[SecretFinding],
209) {
210    emit_secret_scan_log(caller, content_len, findings);
211
212    let Some(event_log) = active_event_log() else {
213        return;
214    };
215
216    if let Err(error) =
217        append_secret_scan_audit(event_log.as_ref(), caller, content_len, findings).await
218    {
219        crate::events::log_warn(
220            "secret_scan.audit",
221            &format!("failed to append secret scan audit event: {error}"),
222        );
223    }
224}
225
226pub(crate) fn register_secret_scan_builtins(vm: &mut Vm) {
227    vm.register_async_builtin("secret_scan", |args| async move {
228        let content = match args.first() {
229            Some(VmValue::Nil) | None => {
230                return Err(VmError::Runtime("secret_scan: content is required".into()));
231            }
232            Some(value) => value.display(),
233        };
234
235        let findings = scan_content(&content);
236        audit_secret_scan_active("stdlib.secret_scan", content.len(), &findings).await;
237
238        let value = serde_json::to_value(findings)
239            .map_err(|error| VmError::Runtime(format!("secret_scan: {error}")))?;
240        Ok(crate::schema::json_to_vm_value(&value))
241    });
242}
243
244fn build_finding(
245    content: &str,
246    line_starts: &[usize],
247    detector: &str,
248    source: &str,
249    title: &str,
250    start_offset: usize,
251    end_offset: usize,
252    matched: &str,
253) -> SecretFinding {
254    let (line, column_start) = offset_to_line_col(content, line_starts, start_offset);
255    let (_, column_end) = offset_to_line_col(content, line_starts, end_offset);
256    SecretFinding {
257        detector: detector.to_string(),
258        source: source.to_string(),
259        title: title.to_string(),
260        line,
261        column_start,
262        column_end,
263        start_offset,
264        end_offset,
265        redacted: redact_match(matched),
266        fingerprint: fingerprint(matched),
267    }
268}
269
/// Returns the byte offset at which each line of `content` begins.
///
/// The first entry is always `0`; every `\n` contributes the offset that
/// immediately follows it, so a trailing newline yields a final entry equal to
/// `content.len()`.
fn line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .match_indices('\n')
                .map(|(newline_at, _)| newline_at + 1),
        )
        .collect()
}
279
/// Converts a byte `offset` into a 1-based `(line, column)` pair.
///
/// `line_starts` must hold the ascending byte offsets at which lines begin.
/// The column is counted in characters from the start of the line.
///
/// # Panics
/// Panics if `line_starts` is empty, or if `offset` is past the end of
/// `content` or not on a character boundary.
fn offset_to_line_col(content: &str, line_starts: &[usize], offset: usize) -> (usize, usize) {
    // An exact hit is the line itself; otherwise the insertion point sits one
    // past the line that contains `offset`.
    let line_index = match line_starts.binary_search(&offset) {
        Ok(exact) => exact,
        Err(insert_at) => insert_at.saturating_sub(1),
    };
    let preceding = &content[line_starts[line_index]..offset];
    (line_index + 1, preceding.chars().count() + 1)
}
288
/// Produces a preview of `matched` that is safe to log.
///
/// * Text starting with `-----BEGIN ` is reduced to its first line followed by
///   ` …`.
/// * Matches of 8 characters or fewer are replaced entirely by `*`.
/// * Longer matches keep a short prefix and suffix around `…`. Each side keeps
///   a quarter of the characters, capped at 4, so at most half of the match is
///   ever revealed. Matches of 16 characters or more therefore still show the
///   usual 4 + 4 characters, while a 9-character match shows 2 + 2 instead of
///   exposing 8 of its 9 characters.
fn redact_match(matched: &str) -> String {
    if matched.starts_with("-----BEGIN ") {
        let header = matched
            .lines()
            .next()
            .unwrap_or("-----BEGIN PRIVATE KEY-----");
        return format!("{header} …");
    }

    let char_count = matched.chars().count();
    if char_count <= 8 {
        return "*".repeat(char_count);
    }

    // Scale the revealed portion with the length so short secrets are not
    // almost fully disclosed by their own "redaction".
    let keep = (char_count / 4).min(4);
    let prefix: String = matched.chars().take(keep).collect();
    let suffix: String = matched.chars().skip(char_count - keep).collect();
    format!("{prefix}…{suffix}")
}
308
309fn fingerprint(matched: &str) -> String {
310    let hash = sha2::Sha256::digest(matched.as_bytes());
311    let hex: String = hash.iter().map(|byte| format!("{byte:02x}")).collect();
312    hex[..16].to_string()
313}
314
/// Computes the Shannon entropy of `value` in bits per character, based on the
/// frequency of each distinct `char`.
///
/// An empty string yields `0.0` because there are no frequencies to sum.
fn shannon_entropy(value: &str) -> f64 {
    let mut histogram: BTreeMap<char, usize> = BTreeMap::new();
    let mut total = 0usize;
    for symbol in value.chars() {
        *histogram.entry(symbol).or_default() += 1;
        total += 1;
    }

    let total = total as f64;
    histogram
        .values()
        .map(|&occurrences| {
            let p = occurrences as f64 / total;
            -(p * p.log2())
        })
        .sum()
}
329
330fn emit_secret_scan_log(caller: &str, content_len: usize, findings: &[SecretFinding]) {
331    let metadata = serde_json::json!({
332        "topic": SECRET_SCAN_AUDIT_TOPIC,
333        "caller": caller,
334        "content_len": content_len,
335        "finding_count": findings.len(),
336        "clean": findings.is_empty(),
337        "findings": findings
338            .iter()
339            .map(|finding| serde_json::json!({
340                "detector": finding.detector,
341                "source": finding.source,
342                "line": finding.line,
343                "fingerprint": finding.fingerprint,
344                "redacted": finding.redacted,
345            }))
346            .collect::<Vec<_>>(),
347    });
348    let metadata = metadata
349        .as_object()
350        .cloned()
351        .map(|object| object.into_iter().collect::<BTreeMap<_, _>>())
352        .unwrap_or_default();
353    crate::events::log_info_meta("secret_scan.audit", "secret scan completed", metadata);
354}
355
#[cfg(test)]
mod tests {
    use super::*;

    use crate::event_log::{EventLog, MemoryEventLog};

    // A GitHub token is reported by its specific rule, while a base64-looking
    // `client_secret` value that no specific rule covers is reported by the
    // high-entropy assignment rule.
    #[test]
    fn scan_content_detects_specific_rules_and_entropy_rule() {
        let findings = scan_content(
            r#"
github_token = "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB"
config = { client_secret: "QWxhZGRpbjpPcGVuU2VzYW1lQWNjZXNzVG9rZW4=" }
"#,
        );

        assert!(findings
            .iter()
            .any(|finding| finding.detector == "github-token"));
        assert!(findings
            .iter()
            .any(|finding| finding.detector == "high-entropy-credential-assignment"));
    }

    // Only the PEM header line is matched, and its redacted form is the header
    // followed by an ellipsis.
    #[test]
    fn scan_content_redacts_private_key_blocks() {
        let findings = scan_content(
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----\n",
        );
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].detector, "private-key-block");
        assert_eq!(
            findings[0].redacted,
            "-----BEGIN OPENSSH PRIVATE KEY----- …"
        );
    }

    // The audit event is written to the audit topic with kind `scan_detected`,
    // and its payload carries the redacted preview, not the secret's body.
    #[tokio::test(flavor = "current_thread")]
    async fn append_secret_scan_audit_writes_redacted_event() {
        let log = MemoryEventLog::new(32);
        let findings = scan_content(r#"token = "sk-abcdefghijklmnopqrstuvwx123456""#);
        append_secret_scan_audit(&log, "test.secret_scan", 44, &findings)
            .await
            .unwrap();

        let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).unwrap();
        let events = log.read_range(&topic, None, 10).await.unwrap();
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].1.kind, "scan_detected");
        assert_eq!(events[0].1.payload["caller"], "test.secret_scan");
        let redacted = events[0].1.payload["findings"][0]["redacted"]
            .as_str()
            .unwrap();
        assert!(redacted.contains('…'));
        assert!(!redacted.contains("abcdefghijklmnopqrstuvwx123456"));
    }
}
411}