1use std::collections::BTreeMap;
2use std::sync::LazyLock;
3
4use regex::Regex;
5use serde::{Deserialize, Serialize};
6use sha2::Digest;
7
8use crate::event_log::{active_event_log, EventLog, LogEvent, Topic};
9use crate::secret_patterns::{SecretPatternSpec, DEFAULT_SECRET_PATTERN_SPECS};
10use crate::value::{VmError, VmValue};
11use crate::vm::Vm;
12
/// Event-log topic that secret-scan audit events are appended to.
pub const SECRET_SCAN_AUDIT_TOPIC: &str = "audit.secret_scan";
/// Minimum Shannon entropy (bits per character, as computed by
/// `shannon_entropy`) that a quoted assignment value must reach before the
/// high-entropy detector reports it.
const HIGH_ENTROPY_THRESHOLD: f64 = 3.5;
15
/// One secret-like match reported by `scan_content`.
///
/// The raw matched text is never stored; only a redacted preview and a
/// fingerprint of it are kept.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SecretFinding {
    /// Identifier of the rule that produced the finding (for example
    /// `high-entropy-credential-assignment`).
    pub detector: String,
    /// Origin label attached to the rule (the entropy rule uses `trufflehog`).
    pub source: String,
    /// Human-readable name of the rule.
    pub title: String,
    /// 1-based line number on which the match starts.
    pub line: usize,
    /// 1-based column of the match start, counted in characters.
    pub column_start: usize,
    /// 1-based column of `end_offset`, counted in characters from the start of
    /// the line that contains `end_offset` (which can differ from `line` for
    /// multi-line matches).
    pub column_end: usize,
    /// Byte offset of the match start within the scanned content.
    pub start_offset: usize,
    /// Byte offset just past the end of the match.
    pub end_offset: usize,
    /// Redacted preview of the matched text.
    pub redacted: String,
    /// First 16 hex characters of the SHA-256 digest of the matched text.
    pub fingerprint: String,
}
29
/// A built-in pattern spec paired with its compiled regular expression.
struct SecretRule {
    /// Static description of the rule (detector id, source, title, pattern).
    spec: &'static SecretPatternSpec,
    /// `spec.regex` compiled once so scans do not recompile it.
    regex: Regex,
}
34
35static SECRET_RULES: LazyLock<Vec<SecretRule>> = LazyLock::new(|| {
36 DEFAULT_SECRET_PATTERN_SPECS
37 .iter()
38 .map(|spec| SecretRule {
39 spec,
40 regex: Regex::new(spec.regex).unwrap_or_else(|error| {
41 panic!("invalid {} secret scan regex: {error}", spec.detector)
42 }),
43 })
44 .collect()
45});
46
47static HIGH_ENTROPY_ASSIGNMENT_RULE: LazyLock<Regex> = LazyLock::new(|| {
48 Regex::new(
49 r#"(?im)(?:secret|token|api[_-]?key|access[_-]?key|password|passwd|pwd|client[_-]?secret|private[_-]?key)[^\n:=]{0,32}(?::|=)\s*["']([A-Za-z0-9+/=_\.-]{20,})["']"#,
50 )
51 .unwrap()
52});
53
54pub fn scan_content(content: &str) -> Vec<SecretFinding> {
55 let line_starts = line_starts(content);
56 let mut findings = Vec::new();
57
58 for rule in SECRET_RULES.iter() {
59 for mat in rule.regex.find_iter(content) {
60 findings.push(build_finding(
61 content,
62 &line_starts,
63 rule.spec.detector,
64 rule.spec.source,
65 rule.spec.title,
66 mat.start(),
67 mat.end(),
68 mat.as_str(),
69 ));
70 }
71 }
72
73 for captures in HIGH_ENTROPY_ASSIGNMENT_RULE.captures_iter(content) {
74 let Some(secret) = captures.get(1) else {
75 continue;
76 };
77 if shannon_entropy(secret.as_str()) < HIGH_ENTROPY_THRESHOLD {
78 continue;
79 }
80 findings.push(build_finding(
81 content,
82 &line_starts,
83 "high-entropy-credential-assignment",
84 "trufflehog",
85 "High-entropy secret assignment",
86 secret.start(),
87 secret.end(),
88 secret.as_str(),
89 ));
90 }
91
92 findings.sort_by(|left, right| {
93 left.start_offset
94 .cmp(&right.start_offset)
95 .then(left.end_offset.cmp(&right.end_offset))
96 .then(left.detector.cmp(&right.detector))
97 });
98 let higher_specificity_spans = findings
99 .iter()
100 .map(|finding| {
101 (
102 finding.start_offset,
103 finding.end_offset,
104 detector_specificity(&finding.detector),
105 )
106 })
107 .collect::<Vec<_>>();
108 findings.retain(|finding| {
109 let specificity = detector_specificity(&finding.detector);
110 !higher_specificity_spans
111 .iter()
112 .any(|(start, end, other_specificity)| {
113 *other_specificity > specificity
114 && spans_overlap((finding.start_offset, finding.end_offset), (*start, *end))
115 })
116 });
117 findings.dedup_by(|left, right| {
118 left.detector == right.detector
119 && left.start_offset == right.start_offset
120 && left.end_offset == right.end_offset
121 });
122 findings
123}
124
/// Ranks a detector by how specific its match is; higher wins on overlap.
///
/// The two generic detectors get ranks 0 and 1; every other detector id is
/// treated as a specific pattern and gets rank 2.
fn detector_specificity(detector: &str) -> u8 {
    const GENERIC_RANKS: [(&str, u8); 2] = [
        ("sensitive-assignment", 0),
        ("high-entropy-credential-assignment", 1),
    ];
    GENERIC_RANKS
        .iter()
        .find(|(name, _)| *name == detector)
        .map_or(2, |&(_, rank)| rank)
}
132
/// Returns `true` when two half-open `(start, end)` spans share at least one
/// position. Spans that merely touch (one ends where the other starts) do not
/// overlap.
fn spans_overlap(left: (usize, usize), right: (usize, usize)) -> bool {
    let (left_start, left_end) = left;
    let (right_start, right_end) = right;
    let disjoint = left_end <= right_start || right_end <= left_start;
    !disjoint
}
136
137pub async fn append_secret_scan_audit<L: EventLog + ?Sized>(
138 event_log: &L,
139 caller: &str,
140 content_len: usize,
141 findings: &[SecretFinding],
142) -> Result<(), crate::event_log::LogError> {
143 let payload = serde_json::json!({
144 "caller": caller,
145 "content_len": content_len,
146 "finding_count": findings.len(),
147 "clean": findings.is_empty(),
148 "findings": findings
149 .iter()
150 .map(|finding| {
151 serde_json::json!({
152 "detector": finding.detector,
153 "source": finding.source,
154 "title": finding.title,
155 "line": finding.line,
156 "column_start": finding.column_start,
157 "column_end": finding.column_end,
158 "start_offset": finding.start_offset,
159 "end_offset": finding.end_offset,
160 "fingerprint": finding.fingerprint,
161 "redacted": finding.redacted,
162 })
163 })
164 .collect::<Vec<_>>(),
165 "observed_at": crate::orchestration::now_rfc3339(),
166 });
167 let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).expect("secret scan audit topic is valid");
168 let kind = if findings.is_empty() {
169 "scan_clean"
170 } else {
171 "scan_detected"
172 };
173 event_log
174 .append(&topic, LogEvent::new(kind, payload))
175 .await?;
176 Ok(())
177}
178
179pub async fn audit_secret_scan_active(
180 caller: &str,
181 content_len: usize,
182 findings: &[SecretFinding],
183) {
184 emit_secret_scan_log(caller, content_len, findings);
185
186 let Some(event_log) = active_event_log() else {
187 return;
188 };
189
190 if let Err(error) =
191 append_secret_scan_audit(event_log.as_ref(), caller, content_len, findings).await
192 {
193 crate::events::log_warn(
194 "secret_scan.audit",
195 &format!("failed to append secret scan audit event: {error}"),
196 );
197 }
198}
199
200pub(crate) fn register_secret_scan_builtins(vm: &mut Vm) {
201 vm.register_async_builtin("secret_scan", |_ctx, args| async move {
202 let content = match args.first() {
203 Some(VmValue::Nil) | None => {
204 return Err(VmError::Runtime("secret_scan: content is required".into()));
205 }
206 Some(value) => value.display(),
207 };
208
209 let findings = scan_content(&content);
210 audit_secret_scan_active("stdlib.secret_scan", content.len(), &findings).await;
211
212 let value = serde_json::to_value(findings)
213 .map_err(|error| VmError::Runtime(format!("secret_scan: {error}")))?;
214 Ok(crate::schema::json_to_vm_value(&value))
215 });
216}
217
218fn build_finding(
219 content: &str,
220 line_starts: &[usize],
221 detector: &str,
222 source: &str,
223 title: &str,
224 start_offset: usize,
225 end_offset: usize,
226 matched: &str,
227) -> SecretFinding {
228 let (line, column_start) = offset_to_line_col(content, line_starts, start_offset);
229 let (_, column_end) = offset_to_line_col(content, line_starts, end_offset);
230 SecretFinding {
231 detector: detector.to_string(),
232 source: source.to_string(),
233 title: title.to_string(),
234 line,
235 column_start,
236 column_end,
237 start_offset,
238 end_offset,
239 redacted: redact_match(matched),
240 fingerprint: fingerprint(matched),
241 }
242}
243
/// Returns the byte offset at which each line of `content` begins.
///
/// The first entry is always 0; every `\n` contributes the offset of the byte
/// that follows it, so a trailing newline adds a final (empty) line start.
fn line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .match_indices('\n')
                .map(|(newline_at, _)| newline_at + 1),
        )
        .collect()
}
253
/// Converts a byte `offset` into a 1-based `(line, column)` pair.
///
/// `line_starts` must hold the ascending byte offsets of each line start (as
/// produced by `line_starts`). The column counts characters, not bytes, from
/// the start of the line.
///
/// # Panics
/// Panics if `line_starts` is empty, or if `offset` is not a valid slice
/// boundary of `content` at or after the start of its line.
fn offset_to_line_col(content: &str, line_starts: &[usize], offset: usize) -> (usize, usize) {
    // The number of line starts at or before `offset` is the 1-based line
    // number; step back one to get the index of that line's start.
    let starts_at_or_before = line_starts.partition_point(|&start| start <= offset);
    let line_index = starts_at_or_before.saturating_sub(1);

    let line_prefix = &content[line_starts[line_index]..offset];
    (line_index + 1, line_prefix.chars().count() + 1)
}
262
/// Produces a display-safe stand-in for a matched secret.
///
/// * Text starting with `-----BEGIN ` is reduced to its header marker (up to
///   and including the dashes that close it) followed by ` …`. If no closing
///   dashes appear on the first line, a generic
///   `-----BEGIN PRIVATE KEY-----` header is shown instead.
/// * Matches of at most 8 characters are fully masked with `*`.
/// * Longer matches keep their first and last 4 characters around `…`.
fn redact_match(matched: &str) -> String {
    const PEM_PREFIX: &str = "-----BEGIN ";
    const PEM_DASHES: &str = "-----";
    const PEM_FALLBACK_HEADER: &str = "-----BEGIN PRIVATE KEY-----";

    if let Some(after_prefix) = matched.strip_prefix(PEM_PREFIX) {
        // Cut at the dashes that close the header rather than at the first
        // newline: a key block pasted on a single line has no newline, and
        // echoing its "first line" would echo the whole key.
        let header = after_prefix
            .find(PEM_DASHES)
            .filter(|&label_len| {
                !after_prefix[..label_len].contains(|ch: char| ch == '\n' || ch == '\r')
            })
            .map_or(PEM_FALLBACK_HEADER, |label_len| {
                &matched[..PEM_PREFIX.len() + label_len + PEM_DASHES.len()]
            });
        return format!("{header} …");
    }

    let chars: Vec<char> = matched.chars().collect();
    if chars.len() <= 8 {
        return "*".repeat(chars.len());
    }
    // NOTE(review): a match only slightly longer than 8 characters keeps most
    // of its text visible (8 of 9 for a 9-character match); consider a larger
    // threshold if short secrets are expected.
    let prefix: String = chars[..4].iter().collect();
    let suffix: String = chars[chars.len() - 4..].iter().collect();
    format!("{prefix}…{suffix}")
}
282
283fn fingerprint(matched: &str) -> String {
284 let hash = sha2::Sha256::digest(matched.as_bytes());
285 let hex: String = hash.iter().map(|byte| format!("{byte:02x}")).collect();
286 hex[..16].to_string()
287}
288
/// Computes the Shannon entropy of `value` in bits per character.
///
/// Each distinct character contributes `-p * log2(p)`, where `p` is its share
/// of all characters. An empty string has no characters and yields zero.
fn shannon_entropy(value: &str) -> f64 {
    let mut histogram: BTreeMap<char, usize> = BTreeMap::new();
    value
        .chars()
        .for_each(|symbol| *histogram.entry(symbol).or_default() += 1);

    let total = value.chars().count() as f64;
    histogram
        .into_values()
        .map(|occurrences| occurrences as f64 / total)
        .map(|probability| -(probability * probability.log2()))
        .sum()
}
303
304fn emit_secret_scan_log(caller: &str, content_len: usize, findings: &[SecretFinding]) {
305 let metadata = serde_json::json!({
306 "topic": SECRET_SCAN_AUDIT_TOPIC,
307 "caller": caller,
308 "content_len": content_len,
309 "finding_count": findings.len(),
310 "clean": findings.is_empty(),
311 "findings": findings
312 .iter()
313 .map(|finding| serde_json::json!({
314 "detector": finding.detector,
315 "source": finding.source,
316 "line": finding.line,
317 "fingerprint": finding.fingerprint,
318 "redacted": finding.redacted,
319 }))
320 .collect::<Vec<_>>(),
321 });
322 let metadata = metadata
323 .as_object()
324 .cloned()
325 .map(|object| object.into_iter().collect::<BTreeMap<_, _>>())
326 .unwrap_or_default();
327 crate::events::log_info_meta("secret_scan.audit", "secret scan completed", metadata);
328}
329
// Unit tests for the scanner, its overlap suppression, redaction and audit event.
#[cfg(test)]
mod tests {
    use super::*;

    use crate::event_log::{EventLog, MemoryEventLog};
    use std::collections::BTreeSet;

    // A token with a known shape is reported by its specific detector, a
    // random-looking quoted value by the entropy detector, and the generic
    // assignment detector is suppressed for both.
    #[test]
    fn scan_content_detects_specific_rules_and_entropy_rule() {
        let findings = scan_content(
            r#"
github_token = "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB"
config = { client_secret: "QWxhZGRpbjpPcGVuU2VzYW1lQWNjZXNzVG9rZW4=" }
"#,
        );

        assert!(findings
            .iter()
            .any(|finding| finding.detector == "github-token"));
        assert!(findings
            .iter()
            .any(|finding| finding.detector == "high-entropy-credential-assignment"));
        assert!(!findings
            .iter()
            .any(|finding| finding.detector == "sensitive-assignment"));
    }

    // When several detectors overlap on one span, only the most specific
    // finding survives.
    #[test]
    fn scan_content_deduplicates_generic_assignment_overlaps() {
        let findings = scan_content(r#"token = "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB""#);
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].detector, "github-token");
    }

    // With no more specific match, the generic assignment finding is kept.
    #[test]
    fn scan_content_keeps_generic_assignment_without_specific_detector() {
        let findings = scan_content(r#"token = "secret123""#);
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].detector, "sensitive-assignment");
    }

    // Source code that merely uses a secret-like identifier is not flagged.
    #[test]
    fn scan_content_preserves_source_declarations_with_secretish_identifiers() {
        let findings = scan_content("pub const Token = struct { kind: u8 };\n");
        assert!(findings.is_empty());
    }

    // A PEM block is reported once, spans BEGIN through END (without the final
    // newline), and is redacted down to its header line.
    #[test]
    fn scan_content_redacts_private_key_blocks() {
        let findings = scan_content(
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----\n",
        );
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].detector, "private-key-block");
        assert_eq!(
            findings[0].end_offset - findings[0].start_offset,
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----"
                .len()
        );
        assert_eq!(
            findings[0].redacted,
            "-----BEGIN OPENSSH PRIVATE KEY----- …"
        );
    }

    // Bearer and JWT token shapes are both detected.
    #[test]
    fn scan_content_covers_redaction_only_token_shapes() {
        let findings = scan_content(
            "Authorization: Bearer abcDEFghi123_-+/=xyz\njwt=eyJabcd.eyJefgh.signature_pad\n",
        );
        let detectors = findings
            .iter()
            .map(|finding| finding.detector.as_str())
            .collect::<BTreeSet<_>>();
        assert!(detectors.contains("bearer-token"));
        assert!(detectors.contains("jwt-token"));
    }

    // Synthetic AI-provider keys are detected with the expected detector and
    // source labels, and no redacted preview contains a full key.
    #[test]
    fn scan_content_covers_ai_provider_token_shapes() {
        let huggingface = format!("hf_{}", "a".repeat(24));
        let cerebras = format!("csk-{}", "b".repeat(48));
        let together = format!("tgp_v1_{}", "c".repeat(32));
        let google = format!("AIza{}", "D".repeat(35));
        let content = format!("{huggingface}\n{cerebras}\n{together}\n{google}\n");

        let findings = scan_content(&content);
        let detectors = findings
            .iter()
            .map(|finding| (finding.detector.as_str(), finding.source.as_str()))
            .collect::<BTreeSet<_>>();

        assert!(detectors.contains(&("huggingface-token", "huggingface-docs")));
        assert!(detectors.contains(&("cerebras-api-key", "cerebras-docs")));
        assert!(detectors.contains(&("together-api-key", "together-bug-report")));
        assert!(detectors.contains(&("google-api-key", "microsoft-purview")));
        for secret in [&huggingface, &cerebras, &together, &google] {
            assert!(!findings
                .iter()
                .any(|finding| finding.redacted.contains(secret)));
        }
    }

    // The audit event is written under the audit topic with kind
    // `scan_detected`, and its payload holds a redacted preview rather than
    // the raw secret.
    #[tokio::test(flavor = "current_thread")]
    async fn append_secret_scan_audit_writes_redacted_event() {
        let log = MemoryEventLog::new(32);
        let findings = scan_content(r#"token = "sk-abcdefghijklmnopqrstuvwx123456""#);
        append_secret_scan_audit(&log, "test.secret_scan", 44, &findings)
            .await
            .unwrap();

        let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).unwrap();
        let events = log.read_range(&topic, None, 10).await.unwrap();
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].1.kind, "scan_detected");
        assert_eq!(events[0].1.payload["caller"], "test.secret_scan");
        let redacted = events[0].1.payload["findings"][0]["redacted"]
            .as_str()
            .unwrap();
        assert!(redacted.contains('…'));
        assert!(!redacted.contains("abcdefghijklmnopqrstuvwx123456"));
    }
}