1use std::collections::{BTreeMap, BTreeSet};
2use std::sync::LazyLock;
3
4use regex::Regex;
5use serde::{Deserialize, Serialize};
6use sha2::Digest;
7
8use crate::event_log::{active_event_log, EventLog, LogEvent, Topic};
9use crate::secret_patterns::{SecretPatternSpec, DEFAULT_SECRET_PATTERN_SPECS};
10use crate::value::{VmError, VmValue};
11use crate::vm::Vm;
12
/// Event-log topic name under which secret-scan audit events are appended
/// (see `append_secret_scan_audit`); also echoed as the `topic` metadata
/// field of the structured log line emitted for every scan.
pub const SECRET_SCAN_AUDIT_TOPIC: &str = "audit.secret_scan";
/// Minimum Shannon entropy (bits per character, as computed by
/// `shannon_entropy`) a quoted value must reach before the generic
/// credential-assignment rule reports it. Values scoring below this are skipped.
const HIGH_ENTROPY_THRESHOLD: f64 = 3.5;
15
/// One secret detected by [`scan_content`].
///
/// The matched text itself is never stored; only a redacted rendering and a
/// short fingerprint are kept.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SecretFinding {
    /// Identifier of the rule that produced the finding
    /// (e.g. `"high-entropy-credential-assignment"`).
    pub detector: String,
    /// Source label attached to the rule (the built-in entropy rule uses
    /// `"trufflehog"`).
    pub source: String,
    /// Human-readable title of the rule.
    pub title: String,
    /// 1-based line number of the line containing `start_offset`.
    pub line: usize,
    /// 1-based column of `start_offset`, counted in `char`s from the start of
    /// its line.
    pub column_start: usize,
    /// 1-based column of `end_offset`, counted in `char`s from the start of
    /// the line that contains `end_offset` (which may differ from `line` for
    /// multi-line matches).
    pub column_end: usize,
    /// Byte offset into the scanned content where the match begins.
    pub start_offset: usize,
    /// Byte offset into the scanned content where the match ends (exclusive).
    pub end_offset: usize,
    /// Redacted rendering of the matched text (see `redact_match`).
    pub redacted: String,
    /// First 16 hex characters of the SHA-256 digest of the matched text.
    pub fingerprint: String,
}
29
/// A pattern specification paired with its compiled regular expression.
struct SecretRule {
    /// Static specification the rule was built from; supplies the detector,
    /// source and title strings copied into each finding.
    spec: &'static SecretPatternSpec,
    /// Regex compiled from `spec.regex`.
    regex: Regex,
}
34
35static SECRET_RULES: LazyLock<Vec<SecretRule>> = LazyLock::new(|| {
36 DEFAULT_SECRET_PATTERN_SPECS
37 .iter()
38 .map(|spec| SecretRule {
39 spec,
40 regex: Regex::new(spec.regex).unwrap_or_else(|error| {
41 panic!("invalid {} secret scan regex: {error}", spec.detector)
42 }),
43 })
44 .collect()
45});
46
47static HIGH_ENTROPY_ASSIGNMENT_RULE: LazyLock<Regex> = LazyLock::new(|| {
48 Regex::new(
49 r#"(?im)(?:secret|token|api[_-]?key|access[_-]?key|password|passwd|pwd|client[_-]?secret|private[_-]?key)[^\n:=]{0,32}(?::|=)\s*["']([A-Za-z0-9+/=_\.-]{20,})["']"#,
50 )
51 .unwrap()
52});
53
54pub fn scan_content(content: &str) -> Vec<SecretFinding> {
55 let line_starts = line_starts(content);
56 let mut findings = Vec::new();
57
58 for rule in SECRET_RULES.iter() {
59 for mat in rule.regex.find_iter(content) {
60 findings.push(build_finding(
61 content,
62 &line_starts,
63 rule.spec.detector,
64 rule.spec.source,
65 rule.spec.title,
66 mat.start(),
67 mat.end(),
68 mat.as_str(),
69 ));
70 }
71 }
72
73 for captures in HIGH_ENTROPY_ASSIGNMENT_RULE.captures_iter(content) {
74 let Some(secret) = captures.get(1) else {
75 continue;
76 };
77 if shannon_entropy(secret.as_str()) < HIGH_ENTROPY_THRESHOLD {
78 continue;
79 }
80 findings.push(build_finding(
81 content,
82 &line_starts,
83 "high-entropy-credential-assignment",
84 "trufflehog",
85 "High-entropy secret assignment",
86 secret.start(),
87 secret.end(),
88 secret.as_str(),
89 ));
90 }
91
92 findings.sort_by(|left, right| {
93 left.start_offset
94 .cmp(&right.start_offset)
95 .then(left.end_offset.cmp(&right.end_offset))
96 .then(left.detector.cmp(&right.detector))
97 });
98 let specific_spans: BTreeSet<(usize, usize)> = findings
99 .iter()
100 .filter(|finding| finding.detector != "high-entropy-credential-assignment")
101 .map(|finding| (finding.start_offset, finding.end_offset))
102 .collect();
103 findings.retain(|finding| {
104 finding.detector != "high-entropy-credential-assignment"
105 || !specific_spans.contains(&(finding.start_offset, finding.end_offset))
106 });
107 findings.dedup_by(|left, right| {
108 left.detector == right.detector
109 && left.start_offset == right.start_offset
110 && left.end_offset == right.end_offset
111 });
112 findings
113}
114
115pub async fn append_secret_scan_audit<L: EventLog + ?Sized>(
116 event_log: &L,
117 caller: &str,
118 content_len: usize,
119 findings: &[SecretFinding],
120) -> Result<(), crate::event_log::LogError> {
121 let payload = serde_json::json!({
122 "caller": caller,
123 "content_len": content_len,
124 "finding_count": findings.len(),
125 "clean": findings.is_empty(),
126 "findings": findings
127 .iter()
128 .map(|finding| {
129 serde_json::json!({
130 "detector": finding.detector,
131 "source": finding.source,
132 "title": finding.title,
133 "line": finding.line,
134 "column_start": finding.column_start,
135 "column_end": finding.column_end,
136 "start_offset": finding.start_offset,
137 "end_offset": finding.end_offset,
138 "fingerprint": finding.fingerprint,
139 "redacted": finding.redacted,
140 })
141 })
142 .collect::<Vec<_>>(),
143 "observed_at": crate::orchestration::now_rfc3339(),
144 });
145 let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).expect("secret scan audit topic is valid");
146 let kind = if findings.is_empty() {
147 "scan_clean"
148 } else {
149 "scan_detected"
150 };
151 event_log
152 .append(&topic, LogEvent::new(kind, payload))
153 .await?;
154 Ok(())
155}
156
157pub async fn audit_secret_scan_active(
158 caller: &str,
159 content_len: usize,
160 findings: &[SecretFinding],
161) {
162 emit_secret_scan_log(caller, content_len, findings);
163
164 let Some(event_log) = active_event_log() else {
165 return;
166 };
167
168 if let Err(error) =
169 append_secret_scan_audit(event_log.as_ref(), caller, content_len, findings).await
170 {
171 crate::events::log_warn(
172 "secret_scan.audit",
173 &format!("failed to append secret scan audit event: {error}"),
174 );
175 }
176}
177
178pub(crate) fn register_secret_scan_builtins(vm: &mut Vm) {
179 vm.register_async_builtin("secret_scan", |args| async move {
180 let content = match args.first() {
181 Some(VmValue::Nil) | None => {
182 return Err(VmError::Runtime("secret_scan: content is required".into()));
183 }
184 Some(value) => value.display(),
185 };
186
187 let findings = scan_content(&content);
188 audit_secret_scan_active("stdlib.secret_scan", content.len(), &findings).await;
189
190 let value = serde_json::to_value(findings)
191 .map_err(|error| VmError::Runtime(format!("secret_scan: {error}")))?;
192 Ok(crate::schema::json_to_vm_value(&value))
193 });
194}
195
196fn build_finding(
197 content: &str,
198 line_starts: &[usize],
199 detector: &str,
200 source: &str,
201 title: &str,
202 start_offset: usize,
203 end_offset: usize,
204 matched: &str,
205) -> SecretFinding {
206 let (line, column_start) = offset_to_line_col(content, line_starts, start_offset);
207 let (_, column_end) = offset_to_line_col(content, line_starts, end_offset);
208 SecretFinding {
209 detector: detector.to_string(),
210 source: source.to_string(),
211 title: title.to_string(),
212 line,
213 column_start,
214 column_end,
215 start_offset,
216 end_offset,
217 redacted: redact_match(matched),
218 fingerprint: fingerprint(matched),
219 }
220}
221
/// Returns the byte offset at which each line of `content` begins.
///
/// The first entry is always `0`; every `\n` contributes the offset of the
/// byte immediately after it, so a trailing newline yields a final entry
/// equal to `content.len()`.
fn line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .match_indices('\n')
                .map(|(newline_at, _)| newline_at + 1),
        )
        .collect()
}
231
/// Converts a byte `offset` into a 1-based `(line, column)` pair.
///
/// `line_starts` must hold the starting byte offset of every line in
/// ascending order (as produced by `line_starts`). The column counts `char`s,
/// not bytes, between the start of the line and `offset`.
///
/// # Panics
///
/// Panics if `line_starts` is empty, or if `offset` is out of bounds or not
/// on a character boundary of `content`.
fn offset_to_line_col(content: &str, line_starts: &[usize], offset: usize) -> (usize, usize) {
    // Number of lines beginning at or before `offset`; the last of them is
    // the line that contains it.
    let lines_begun = line_starts.partition_point(|&start| start <= offset);
    let line_index = lines_begun.saturating_sub(1);

    let line_prefix = &content[line_starts[line_index]..offset];
    let column = line_prefix.chars().count() + 1;

    (line_index + 1, column)
}
240
/// Produces a rendering of a matched secret that is safe to log.
///
/// * PEM-style blocks (text starting with `-----BEGIN `) are reduced to their
///   armor header, e.g. `-----BEGIN OPENSSH PRIVATE KEY----- …`. The header is
///   cut at its closing `-----`, not at the end of the first line, so key
///   material is never included even when the whole block sits on one
///   physical line (single-line PEM, or newlines stored as escaped `\n`).
///   If no closing `-----` is found on the first line, a generic
///   `-----BEGIN PRIVATE KEY-----` header is used instead.
/// * Matches of at most 8 characters are replaced entirely by `*`.
/// * Longer matches keep their first and last 4 characters around a `…`.
fn redact_match(matched: &str) -> String {
    const PEM_PREFIX: &str = "-----BEGIN ";
    const PEM_DASHES: &str = "-----";
    const GENERIC_PEM_HEADER: &str = "-----BEGIN PRIVATE KEY-----";

    if let Some(after_prefix) = matched.strip_prefix(PEM_PREFIX) {
        // Only look for the closing dashes on the first line so the END
        // marker of the block can never be mistaken for the header's end.
        let first_line = after_prefix.lines().next().unwrap_or("");
        let header = match first_line.find(PEM_DASHES) {
            Some(label_len) => &matched[..PEM_PREFIX.len() + label_len + PEM_DASHES.len()],
            None => GENERIC_PEM_HEADER,
        };
        return format!("{header} …");
    }

    // Work in characters, not bytes, so multi-byte text is never split.
    let char_count = matched.chars().count();
    if char_count <= 8 {
        return "*".repeat(char_count);
    }
    let prefix: String = matched.chars().take(4).collect();
    let suffix: String = matched.chars().skip(char_count - 4).collect();
    format!("{prefix}…{suffix}")
}
260
261fn fingerprint(matched: &str) -> String {
262 let hash = sha2::Sha256::digest(matched.as_bytes());
263 let hex: String = hash.iter().map(|byte| format!("{byte:02x}")).collect();
264 hex[..16].to_string()
265}
266
/// Computes the Shannon entropy of `value` in bits per character.
///
/// Each distinct `char` contributes `-p * log2(p)`, where `p` is its relative
/// frequency. An empty string yields zero.
fn shannon_entropy(value: &str) -> f64 {
    let mut frequencies: BTreeMap<char, usize> = BTreeMap::new();
    let mut total_chars = 0usize;
    for symbol in value.chars() {
        *frequencies.entry(symbol).or_default() += 1;
        total_chars += 1;
    }

    let total = total_chars as f64;
    frequencies
        .into_values()
        .map(|occurrences| {
            let p = occurrences as f64 / total;
            -(p * p.log2())
        })
        .sum()
}
281
282fn emit_secret_scan_log(caller: &str, content_len: usize, findings: &[SecretFinding]) {
283 let metadata = serde_json::json!({
284 "topic": SECRET_SCAN_AUDIT_TOPIC,
285 "caller": caller,
286 "content_len": content_len,
287 "finding_count": findings.len(),
288 "clean": findings.is_empty(),
289 "findings": findings
290 .iter()
291 .map(|finding| serde_json::json!({
292 "detector": finding.detector,
293 "source": finding.source,
294 "line": finding.line,
295 "fingerprint": finding.fingerprint,
296 "redacted": finding.redacted,
297 }))
298 .collect::<Vec<_>>(),
299 });
300 let metadata = metadata
301 .as_object()
302 .cloned()
303 .map(|object| object.into_iter().collect::<BTreeMap<_, _>>())
304 .unwrap_or_default();
305 crate::events::log_info_meta("secret_scan.audit", "secret scan completed", metadata);
306}
307
/// Unit tests for the scanner, the redaction rules and the audit event.
#[cfg(test)]
mod tests {
    use super::*;

    use crate::event_log::{EventLog, MemoryEventLog};

    /// A token with a dedicated detector and a generic high-entropy
    /// assignment in the same input must both be reported.
    #[test]
    fn scan_content_detects_specific_rules_and_entropy_rule() {
        let findings = scan_content(
            r#"
github_token = "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB"
config = { client_secret: "QWxhZGRpbjpPcGVuU2VzYW1lQWNjZXNzVG9rZW4=" }
"#,
        );

        assert!(findings
            .iter()
            .any(|finding| finding.detector == "github-token"));
        assert!(findings
            .iter()
            .any(|finding| finding.detector == "high-entropy-credential-assignment"));
    }

    /// A PEM private-key block yields exactly one finding that spans the
    /// block without its trailing newline and is redacted to its BEGIN header.
    #[test]
    fn scan_content_redacts_private_key_blocks() {
        let findings = scan_content(
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----\n",
        );
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].detector, "private-key-block");
        // The span must cover BEGIN through END, excluding the final newline.
        assert_eq!(
            findings[0].end_offset - findings[0].start_offset,
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----"
                .len()
        );
        assert_eq!(
            findings[0].redacted,
            "-----BEGIN OPENSSH PRIVATE KEY----- …"
        );
    }

    /// Bearer tokens and JWT-shaped strings are picked up by their detectors.
    #[test]
    fn scan_content_covers_redaction_only_token_shapes() {
        let findings = scan_content(
            "Authorization: Bearer abcDEFghi123_-+/=xyz\njwt=eyJabcd.eyJefgh.signature_pad\n",
        );
        let detectors = findings
            .iter()
            .map(|finding| finding.detector.as_str())
            .collect::<BTreeSet<_>>();
        assert!(detectors.contains("bearer-token"));
        assert!(detectors.contains("jwt-token"));
    }

    /// The audit event is appended to the audit topic with kind
    /// `scan_detected`, and its payload carries only the redacted secret.
    #[tokio::test(flavor = "current_thread")]
    async fn append_secret_scan_audit_writes_redacted_event() {
        let log = MemoryEventLog::new(32);
        let findings = scan_content(r#"token = "sk-abcdefghijklmnopqrstuvwx123456""#);
        // `44` is an arbitrary content length; this test does not assert on it.
        append_secret_scan_audit(&log, "test.secret_scan", 44, &findings)
            .await
            .unwrap();

        let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).unwrap();
        let events = log.read_range(&topic, None, 10).await.unwrap();
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].1.kind, "scan_detected");
        assert_eq!(events[0].1.payload["caller"], "test.secret_scan");
        let redacted = events[0].1.payload["findings"][0]["redacted"]
            .as_str()
            .unwrap();
        // The ellipsis marks the elided middle; the secret body must be absent.
        assert!(redacted.contains('…'));
        assert!(!redacted.contains("abcdefghijklmnopqrstuvwx123456"));
    }
}