1use std::collections::{BTreeMap, BTreeSet};
2use std::sync::LazyLock;
3
4use regex::Regex;
5use serde::{Deserialize, Serialize};
6use sha2::Digest;
7
8use crate::event_log::{active_event_log, EventLog, LogEvent, Topic};
9use crate::value::{VmError, VmValue};
10use crate::vm::Vm;
11
/// Event-log topic that secret-scan audit events are appended to.
pub const SECRET_SCAN_AUDIT_TOPIC: &str = "audit.secret_scan";
/// Minimum Shannon entropy (bits per character) that a quoted credential-style
/// assignment value must reach before `scan_content` reports it.
const HIGH_ENTROPY_THRESHOLD: f64 = 3.5;
14
/// One potential secret located by `scan_content`.
///
/// The finding carries only a redacted preview and a truncated hash of the
/// matched text; the raw match itself is not stored on the struct.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SecretFinding {
    /// Identifier of the rule that matched, e.g. `github-token`.
    pub detector: String,
    /// Provenance label attached to the rule (the built-in rules use
    /// `gitleaks`, `detect-secrets` and `trufflehog`).
    pub source: String,
    /// Human-readable name of the rule.
    pub title: String,
    /// 1-based line number on which the match starts.
    pub line: usize,
    /// 1-based column of the first matched character, counted in `char`s.
    pub column_start: usize,
    /// 1-based column just past the last matched character, counted in
    /// `char`s from the start of the line that contains `end_offset`.
    pub column_end: usize,
    /// Byte offset of the start of the match within the scanned content.
    pub start_offset: usize,
    /// Byte offset one past the end of the match within the scanned content.
    pub end_offset: usize,
    /// Masked preview of the matched text.
    pub redacted: String,
    /// First 16 hex characters of the SHA-256 digest of the matched text.
    pub fingerprint: String,
}
28
/// A single pattern-based detector: identifying metadata plus its compiled regex.
struct SecretRule {
    /// Identifier copied into `SecretFinding::detector` for every match.
    detector: &'static str,
    /// Provenance label copied into `SecretFinding::source`.
    source: &'static str,
    /// Human-readable name copied into `SecretFinding::title`.
    title: &'static str,
    /// Compiled pattern; each whole match is reported as one finding.
    regex: Regex,
}
35
36static SECRET_RULES: LazyLock<Vec<SecretRule>> = LazyLock::new(|| {
37 vec![
38 SecretRule {
39 detector: "aws-access-key-id",
40 source: "gitleaks",
41 title: "AWS access key id",
42 regex: Regex::new(r"\b(?:AKIA|ASIA|AGPA|AIDA|ANPA|AROA|AIPA)[A-Z0-9]{16}\b").unwrap(),
43 },
44 SecretRule {
45 detector: "github-token",
46 source: "gitleaks",
47 title: "GitHub token",
48 regex: Regex::new(r"\bgh(?:p|o|u|s|r)_[A-Za-z0-9]{36,255}\b").unwrap(),
49 },
50 SecretRule {
51 detector: "github-fine-grained-token",
52 source: "gitleaks",
53 title: "GitHub fine-grained personal access token",
54 regex: Regex::new(r"\bgithub_pat_[A-Za-z0-9_]{20,255}\b").unwrap(),
55 },
56 SecretRule {
57 detector: "gitlab-token",
58 source: "detect-secrets",
59 title: "GitLab personal access token",
60 regex: Regex::new(r"\bglpat-[A-Za-z0-9_-]{20,255}\b").unwrap(),
61 },
62 SecretRule {
63 detector: "npm-token",
64 source: "detect-secrets",
65 title: "npm access token",
66 regex: Regex::new(r"\bnpm_[A-Za-z0-9]{36}\b").unwrap(),
67 },
68 SecretRule {
69 detector: "openai-api-key",
70 source: "detect-secrets",
71 title: "OpenAI API key",
72 regex: Regex::new(r"\bsk-[A-Za-z0-9_-]{20,255}\b").unwrap(),
73 },
74 SecretRule {
75 detector: "slack-token",
76 source: "trufflehog",
77 title: "Slack token",
78 regex: Regex::new(r"\bxox(?:a|b|p|r|s)-[A-Za-z0-9-]{10,255}\b").unwrap(),
79 },
80 SecretRule {
81 detector: "stripe-secret-key",
82 source: "trufflehog",
83 title: "Stripe secret or restricted key",
84 regex: Regex::new(r"\b(?:rk|sk)_(?:live|test)_[0-9A-Za-z]{16,255}\b").unwrap(),
85 },
86 SecretRule {
87 detector: "private-key-block",
88 source: "detect-secrets",
89 title: "Private key block",
90 regex: Regex::new(r"(?m)^-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----$").unwrap(),
91 },
92 ]
93});
94
95static HIGH_ENTROPY_ASSIGNMENT_RULE: LazyLock<Regex> = LazyLock::new(|| {
96 Regex::new(
97 r#"(?im)(?:secret|token|api[_-]?key|access[_-]?key|password|passwd|pwd|client[_-]?secret|private[_-]?key)[^\n:=]{0,32}(?::|=)\s*["']([A-Za-z0-9+/=_\.-]{20,})["']"#,
98 )
99 .unwrap()
100});
101
102pub fn scan_content(content: &str) -> Vec<SecretFinding> {
103 let line_starts = line_starts(content);
104 let mut findings = Vec::new();
105
106 for rule in SECRET_RULES.iter() {
107 for mat in rule.regex.find_iter(content) {
108 findings.push(build_finding(
109 content,
110 &line_starts,
111 rule.detector,
112 rule.source,
113 rule.title,
114 mat.start(),
115 mat.end(),
116 mat.as_str(),
117 ));
118 }
119 }
120
121 for captures in HIGH_ENTROPY_ASSIGNMENT_RULE.captures_iter(content) {
122 let Some(secret) = captures.get(1) else {
123 continue;
124 };
125 if shannon_entropy(secret.as_str()) < HIGH_ENTROPY_THRESHOLD {
126 continue;
127 }
128 findings.push(build_finding(
129 content,
130 &line_starts,
131 "high-entropy-credential-assignment",
132 "trufflehog",
133 "High-entropy secret assignment",
134 secret.start(),
135 secret.end(),
136 secret.as_str(),
137 ));
138 }
139
140 findings.sort_by(|left, right| {
141 left.start_offset
142 .cmp(&right.start_offset)
143 .then(left.end_offset.cmp(&right.end_offset))
144 .then(left.detector.cmp(&right.detector))
145 });
146 let specific_spans: BTreeSet<(usize, usize)> = findings
147 .iter()
148 .filter(|finding| finding.detector != "high-entropy-credential-assignment")
149 .map(|finding| (finding.start_offset, finding.end_offset))
150 .collect();
151 findings.retain(|finding| {
152 finding.detector != "high-entropy-credential-assignment"
153 || !specific_spans.contains(&(finding.start_offset, finding.end_offset))
154 });
155 findings.dedup_by(|left, right| {
156 left.detector == right.detector
157 && left.start_offset == right.start_offset
158 && left.end_offset == right.end_offset
159 });
160 findings
161}
162
163pub async fn append_secret_scan_audit<L: EventLog + ?Sized>(
164 event_log: &L,
165 caller: &str,
166 content_len: usize,
167 findings: &[SecretFinding],
168) -> Result<(), crate::event_log::LogError> {
169 let payload = serde_json::json!({
170 "caller": caller,
171 "content_len": content_len,
172 "finding_count": findings.len(),
173 "clean": findings.is_empty(),
174 "findings": findings
175 .iter()
176 .map(|finding| {
177 serde_json::json!({
178 "detector": finding.detector,
179 "source": finding.source,
180 "title": finding.title,
181 "line": finding.line,
182 "column_start": finding.column_start,
183 "column_end": finding.column_end,
184 "start_offset": finding.start_offset,
185 "end_offset": finding.end_offset,
186 "fingerprint": finding.fingerprint,
187 "redacted": finding.redacted,
188 })
189 })
190 .collect::<Vec<_>>(),
191 "observed_at": crate::orchestration::now_rfc3339(),
192 });
193 let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).expect("secret scan audit topic is valid");
194 let kind = if findings.is_empty() {
195 "scan_clean"
196 } else {
197 "scan_detected"
198 };
199 event_log
200 .append(&topic, LogEvent::new(kind, payload))
201 .await?;
202 Ok(())
203}
204
205pub async fn audit_secret_scan_active(
206 caller: &str,
207 content_len: usize,
208 findings: &[SecretFinding],
209) {
210 emit_secret_scan_log(caller, content_len, findings);
211
212 let Some(event_log) = active_event_log() else {
213 return;
214 };
215
216 if let Err(error) =
217 append_secret_scan_audit(event_log.as_ref(), caller, content_len, findings).await
218 {
219 crate::events::log_warn(
220 "secret_scan.audit",
221 &format!("failed to append secret scan audit event: {error}"),
222 );
223 }
224}
225
226pub(crate) fn register_secret_scan_builtins(vm: &mut Vm) {
227 vm.register_async_builtin("secret_scan", |args| async move {
228 let content = match args.first() {
229 Some(VmValue::Nil) | None => {
230 return Err(VmError::Runtime("secret_scan: content is required".into()));
231 }
232 Some(value) => value.display(),
233 };
234
235 let findings = scan_content(&content);
236 audit_secret_scan_active("stdlib.secret_scan", content.len(), &findings).await;
237
238 let value = serde_json::to_value(findings)
239 .map_err(|error| VmError::Runtime(format!("secret_scan: {error}")))?;
240 Ok(crate::schema::json_to_vm_value(&value))
241 });
242}
243
244fn build_finding(
245 content: &str,
246 line_starts: &[usize],
247 detector: &str,
248 source: &str,
249 title: &str,
250 start_offset: usize,
251 end_offset: usize,
252 matched: &str,
253) -> SecretFinding {
254 let (line, column_start) = offset_to_line_col(content, line_starts, start_offset);
255 let (_, column_end) = offset_to_line_col(content, line_starts, end_offset);
256 SecretFinding {
257 detector: detector.to_string(),
258 source: source.to_string(),
259 title: title.to_string(),
260 line,
261 column_start,
262 column_end,
263 start_offset,
264 end_offset,
265 redacted: redact_match(matched),
266 fingerprint: fingerprint(matched),
267 }
268}
269
/// Returns the byte offset at which each line of `content` begins.
///
/// The first entry is always `0`; each `\n` contributes the offset of the byte
/// that follows it, so the result is strictly increasing.
fn line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .match_indices('\n')
                .map(|(newline_at, _)| newline_at + 1),
        )
        .collect()
}
279
/// Converts a byte `offset` into a 1-based `(line, column)` pair.
///
/// `line_starts` must hold the sorted line-start offsets of `content`. The
/// column counts `char`s, not bytes, from the start of the line.
///
/// # Panics
///
/// Panics if `line_starts` is empty, or if `offset` is out of range or not on
/// a `char` boundary of `content`.
fn offset_to_line_col(content: &str, line_starts: &[usize], offset: usize) -> (usize, usize) {
    // Number of lines that begin at or before `offset`; the last one contains it.
    let lines_at_or_before = line_starts.partition_point(|&begin| begin <= offset);
    let line_index = lines_at_or_before.saturating_sub(1);

    let prefix = &content[line_starts[line_index]..offset];
    (line_index + 1, prefix.chars().count() + 1)
}
288
/// Produces a preview of `matched` that is safe to log.
///
/// * Text starting with `-----BEGIN ` is reduced to its first line followed by
///   ` …`.
/// * Text of at most eight characters is replaced by the same number of `*`.
/// * Anything longer keeps its first and last four characters around a `…`.
fn redact_match(matched: &str) -> String {
    if matched.starts_with("-----BEGIN ") {
        let header = matched
            .lines()
            .next()
            .unwrap_or("-----BEGIN PRIVATE KEY-----");
        return format!("{header} …");
    }

    let total = matched.chars().count();
    if total <= 8 {
        return "*".repeat(total);
    }

    let head: String = matched.chars().take(4).collect();
    let tail: String = matched.chars().skip(total - 4).collect();
    format!("{head}…{tail}")
}
308
309fn fingerprint(matched: &str) -> String {
310 let hash = sha2::Sha256::digest(matched.as_bytes());
311 let hex: String = hash.iter().map(|byte| format!("{byte:02x}")).collect();
312 hex[..16].to_string()
313}
314
/// Computes the Shannon entropy of `value` in bits per character.
///
/// Character frequencies are tallied over `char`s. An empty string has no
/// terms to sum and yields zero.
fn shannon_entropy(value: &str) -> f64 {
    let mut frequency: BTreeMap<char, usize> = BTreeMap::new();
    let mut total = 0usize;
    for symbol in value.chars() {
        *frequency.entry(symbol).or_default() += 1;
        total += 1;
    }

    let total = total as f64;
    // The ordered map fixes the summation order, keeping the result deterministic.
    frequency
        .into_values()
        .map(|occurrences| {
            let share = occurrences as f64 / total;
            -(share * share.log2())
        })
        .sum()
}
329
330fn emit_secret_scan_log(caller: &str, content_len: usize, findings: &[SecretFinding]) {
331 let metadata = serde_json::json!({
332 "topic": SECRET_SCAN_AUDIT_TOPIC,
333 "caller": caller,
334 "content_len": content_len,
335 "finding_count": findings.len(),
336 "clean": findings.is_empty(),
337 "findings": findings
338 .iter()
339 .map(|finding| serde_json::json!({
340 "detector": finding.detector,
341 "source": finding.source,
342 "line": finding.line,
343 "fingerprint": finding.fingerprint,
344 "redacted": finding.redacted,
345 }))
346 .collect::<Vec<_>>(),
347 });
348 let metadata = metadata
349 .as_object()
350 .cloned()
351 .map(|object| object.into_iter().collect::<BTreeMap<_, _>>())
352 .unwrap_or_default();
353 crate::events::log_info_meta("secret_scan.audit", "secret scan completed", metadata);
354}
355
#[cfg(test)]
mod tests {
    use super::*;

    use crate::event_log::{EventLog, MemoryEventLog};

    /// A format-specific rule and the entropy heuristic both report findings
    /// when the input contains one secret of each kind.
    #[test]
    fn scan_content_detects_specific_rules_and_entropy_rule() {
        let sample = r#"
github_token = "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB"
config = { client_secret: "QWxhZGRpbjpPcGVuU2VzYW1lQWNjZXNzVG9rZW4=" }
"#;
        let detectors: Vec<String> = scan_content(sample)
            .into_iter()
            .map(|found| found.detector)
            .collect();

        assert!(detectors.iter().any(|name| name == "github-token"));
        assert!(detectors
            .iter()
            .any(|name| name == "high-entropy-credential-assignment"));
    }

    /// A private key header is reported once and previewed as the header line
    /// followed by an ellipsis.
    #[test]
    fn scan_content_redacts_private_key_blocks() {
        let pem =
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----\n";
        let hits = scan_content(pem);

        assert_eq!(hits.len(), 1);
        let only = &hits[0];
        assert_eq!(only.detector, "private-key-block");
        assert_eq!(only.redacted, "-----BEGIN OPENSSH PRIVATE KEY----- …");
    }

    /// The audit event is written to the audit topic and carries the redacted
    /// preview rather than the secret body.
    #[tokio::test(flavor = "current_thread")]
    async fn append_secret_scan_audit_writes_redacted_event() {
        let sink = MemoryEventLog::new(32);
        let hits = scan_content(r#"token = "sk-abcdefghijklmnopqrstuvwx123456""#);
        append_secret_scan_audit(&sink, "test.secret_scan", 44, &hits)
            .await
            .unwrap();

        let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).unwrap();
        let recorded = sink.read_range(&topic, None, 10).await.unwrap();
        assert_eq!(recorded.len(), 1);

        let event = &recorded[0].1;
        assert_eq!(event.kind, "scan_detected");
        assert_eq!(event.payload["caller"], "test.secret_scan");

        let preview = event.payload["findings"][0]["redacted"]
            .as_str()
            .unwrap();
        assert!(preview.contains('…'));
        assert!(!preview.contains("abcdefghijklmnopqrstuvwx123456"));
    }
}