1use std::collections::BTreeMap;
2use std::sync::LazyLock;
3
4use regex::Regex;
5use serde::{Deserialize, Serialize};
6use sha2::Digest;
7
8use crate::event_log::{active_event_log, EventLog, LogEvent, Topic};
9use crate::secret_patterns::{SecretPatternSpec, DEFAULT_SECRET_PATTERN_SPECS};
10use crate::value::{VmError, VmValue};
11use crate::vm::Vm;
12
/// Event-log topic name under which secret scan audit events are appended.
pub const SECRET_SCAN_AUDIT_TOPIC: &str = "audit.secret_scan";
/// Minimum Shannon entropy (bits per character) a quoted value must reach for
/// the high-entropy assignment heuristic to report it; lower values are skipped.
const HIGH_ENTROPY_THRESHOLD: f64 = 3.5;
15
/// One potential secret located by the scanner.
///
/// Only a redacted rendering and a short fingerprint of the matched text are
/// stored; the matched text itself is not kept in this struct.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SecretFinding {
    /// Identifier of the rule that produced the finding (for example
    /// `"high-entropy-credential-assignment"`).
    pub detector: String,
    /// Source label attached to the rule (copied from the pattern spec, or
    /// `"trufflehog"` for the entropy heuristic).
    pub source: String,
    /// Human-readable title of the rule.
    pub title: String,
    /// Precision class of the rule, copied from the pattern spec or set to
    /// `PRECISION_HEURISTIC` for the entropy heuristic.
    pub precision: String,
    /// 1-based line number of the start of the match.
    pub line: usize,
    /// 1-based column of the start of the match, counted in characters.
    pub column_start: usize,
    /// 1-based column of the (exclusive) end of the match, counted in
    /// characters from the start of the line that contains the end offset.
    pub column_end: usize,
    /// Byte offset of the start of the match within the scanned content.
    pub start_offset: usize,
    /// Byte offset just past the end of the match within the scanned content.
    pub end_offset: usize,
    /// Redacted rendering of the matched text.
    pub redacted: String,
    /// First 16 hex characters of the SHA-256 digest of the matched text.
    pub fingerprint: String,
}
34
/// A pattern spec paired with its compiled regular expression.
struct SecretRule {
    /// Static description of the rule (detector id, source, title, precision, regex text).
    spec: &'static SecretPatternSpec,
    /// Regex compiled from `spec.regex`.
    regex: Regex,
}
39
40static SECRET_RULES: LazyLock<Vec<SecretRule>> = LazyLock::new(|| {
41 DEFAULT_SECRET_PATTERN_SPECS
42 .iter()
43 .map(|spec| SecretRule {
44 spec,
45 regex: Regex::new(spec.regex).unwrap_or_else(|error| {
46 panic!("invalid {} secret scan regex: {error}", spec.detector)
47 }),
48 })
49 .collect()
50});
51
52static HIGH_ENTROPY_ASSIGNMENT_RULE: LazyLock<Regex> = LazyLock::new(|| {
53 Regex::new(
54 r#"(?im)(?:secret|token|api[_-]?key|access[_-]?key|password|passwd|pwd|client[_-]?secret|private[_-]?key)[^\n:=]{0,32}(?::|=)\s*["']([A-Za-z0-9+/=_\.-]{20,})["']"#,
55 )
56 .unwrap()
57});
58
59pub fn scan_content(content: &str) -> Vec<SecretFinding> {
60 let line_starts = line_starts(content);
61 let mut findings = Vec::new();
62
63 for rule in SECRET_RULES.iter() {
64 for mat in rule.regex.find_iter(content) {
65 findings.push(build_finding(
66 content,
67 &line_starts,
68 rule.spec.detector,
69 rule.spec.source,
70 rule.spec.title,
71 rule.spec.precision,
72 mat.start(),
73 mat.end(),
74 mat.as_str(),
75 ));
76 }
77 }
78
79 for captures in HIGH_ENTROPY_ASSIGNMENT_RULE.captures_iter(content) {
80 let Some(secret) = captures.get(1) else {
81 continue;
82 };
83 if shannon_entropy(secret.as_str()) < HIGH_ENTROPY_THRESHOLD {
84 continue;
85 }
86 findings.push(build_finding(
87 content,
88 &line_starts,
89 "high-entropy-credential-assignment",
90 "trufflehog",
91 "High-entropy secret assignment",
92 crate::secret_patterns::PRECISION_HEURISTIC,
93 secret.start(),
94 secret.end(),
95 secret.as_str(),
96 ));
97 }
98
99 findings.sort_by(|left, right| {
100 left.start_offset
101 .cmp(&right.start_offset)
102 .then(left.end_offset.cmp(&right.end_offset))
103 .then(left.detector.cmp(&right.detector))
104 });
105 let higher_specificity_spans = findings
106 .iter()
107 .map(|finding| {
108 (
109 finding.start_offset,
110 finding.end_offset,
111 detector_specificity(&finding.detector),
112 )
113 })
114 .collect::<Vec<_>>();
115 findings.retain(|finding| {
116 let specificity = detector_specificity(&finding.detector);
117 !higher_specificity_spans
118 .iter()
119 .any(|(start, end, other_specificity)| {
120 *other_specificity > specificity
121 && spans_overlap((finding.start_offset, finding.end_offset), (*start, *end))
122 })
123 });
124 findings.dedup_by(|left, right| {
125 left.detector == right.detector
126 && left.start_offset == right.start_offset
127 && left.end_offset == right.end_offset
128 });
129 findings
130}
131
/// Ranks a detector by how specific it is; larger means more specific.
///
/// The generic `sensitive-assignment` rule ranks lowest (0), the entropy
/// heuristic next (1), and every other detector highest (2).
fn detector_specificity(detector: &str) -> u8 {
    if detector == "sensitive-assignment" {
        0
    } else if detector == "high-entropy-credential-assignment" {
        1
    } else {
        2
    }
}
139
/// Returns `true` when the two `(start, end)` spans overlap, i.e. each span
/// starts strictly before the other one ends. Spans that merely touch
/// (`left.1 == right.0`) do not overlap.
fn spans_overlap(left: (usize, usize), right: (usize, usize)) -> bool {
    let (left_start, left_end) = left;
    let (right_start, right_end) = right;
    if left_start >= right_end {
        return false;
    }
    right_start < left_end
}
143
144pub async fn append_secret_scan_audit<L: EventLog + ?Sized>(
145 event_log: &L,
146 caller: &str,
147 content_len: usize,
148 findings: &[SecretFinding],
149) -> Result<(), crate::event_log::LogError> {
150 let payload = serde_json::json!({
151 "caller": caller,
152 "content_len": content_len,
153 "finding_count": findings.len(),
154 "clean": findings.is_empty(),
155 "findings": findings
156 .iter()
157 .map(|finding| {
158 serde_json::json!({
159 "detector": finding.detector,
160 "source": finding.source,
161 "title": finding.title,
162 "precision": finding.precision,
163 "line": finding.line,
164 "column_start": finding.column_start,
165 "column_end": finding.column_end,
166 "start_offset": finding.start_offset,
167 "end_offset": finding.end_offset,
168 "fingerprint": finding.fingerprint,
169 "redacted": finding.redacted,
170 })
171 })
172 .collect::<Vec<_>>(),
173 "observed_at": crate::orchestration::now_rfc3339(),
174 });
175 let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).expect("secret scan audit topic is valid");
176 let kind = if findings.is_empty() {
177 "scan_clean"
178 } else {
179 "scan_detected"
180 };
181 event_log
182 .append(&topic, LogEvent::new(kind, payload))
183 .await?;
184 Ok(())
185}
186
187pub async fn audit_secret_scan_active(
188 caller: &str,
189 content_len: usize,
190 findings: &[SecretFinding],
191) {
192 emit_secret_scan_log(caller, content_len, findings);
193
194 let Some(event_log) = active_event_log() else {
195 return;
196 };
197
198 if let Err(error) =
199 append_secret_scan_audit(event_log.as_ref(), caller, content_len, findings).await
200 {
201 crate::events::log_warn(
202 "secret_scan.audit",
203 &format!("failed to append secret scan audit event: {error}"),
204 );
205 }
206}
207
208pub(crate) fn register_secret_scan_builtins(vm: &mut Vm) {
209 vm.register_async_builtin("secret_scan", |_ctx, args| async move {
210 let content = match args.first() {
211 Some(VmValue::Nil) | None => {
212 return Err(VmError::Runtime("secret_scan: content is required".into()));
213 }
214 Some(value) => value.display(),
215 };
216 let audit = secret_scan_audit_option(args.get(1))?;
217
218 let findings = scan_content(&content);
219 if audit {
220 audit_secret_scan_active("stdlib.secret_scan", content.len(), &findings).await;
221 }
222
223 let value = serde_json::to_value(findings)
224 .map_err(|error| VmError::Runtime(format!("secret_scan: {error}")))?;
225 Ok(crate::schema::json_to_vm_value(&value))
226 });
227}
228
229fn secret_scan_audit_option(value: Option<&VmValue>) -> Result<bool, VmError> {
235 let map = match value {
236 None | Some(VmValue::Nil) => return Ok(true),
237 Some(VmValue::Dict(map)) => map,
238 Some(other) => {
239 return Err(VmError::Runtime(format!(
240 "secret_scan: options must be a dict or nil; got {}",
241 other.type_name()
242 )));
243 }
244 };
245 match map.get("audit") {
246 None | Some(VmValue::Nil) => Ok(true),
247 Some(VmValue::Bool(flag)) => Ok(*flag),
248 Some(other) => Err(VmError::Runtime(format!(
249 "secret_scan: options.audit must be a bool; got {}",
250 other.type_name()
251 ))),
252 }
253}
254
255#[allow(clippy::too_many_arguments)]
256fn build_finding(
257 content: &str,
258 line_starts: &[usize],
259 detector: &str,
260 source: &str,
261 title: &str,
262 precision: &str,
263 start_offset: usize,
264 end_offset: usize,
265 matched: &str,
266) -> SecretFinding {
267 let (line, column_start) = offset_to_line_col(content, line_starts, start_offset);
268 let (_, column_end) = offset_to_line_col(content, line_starts, end_offset);
269 SecretFinding {
270 detector: detector.to_string(),
271 source: source.to_string(),
272 title: title.to_string(),
273 precision: precision.to_string(),
274 line,
275 column_start,
276 column_end,
277 start_offset,
278 end_offset,
279 redacted: redact_match(matched),
280 fingerprint: fingerprint(matched),
281 }
282}
283
/// Returns the byte offset at which each line of `content` begins.
///
/// The first entry is always `0`; every `\n` adds the offset just past it, so
/// a trailing newline yields a final entry equal to `content.len()`.
fn line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .match_indices('\n')
                .map(|(newline_at, _)| newline_at + 1),
        )
        .collect()
}
293
/// Converts a byte `offset` in `content` into a 1-based `(line, column)` pair.
///
/// `line_starts` must hold the strictly increasing byte offsets of each line
/// start, beginning with `0`. The column counts characters (not bytes) from
/// the start of the line.
///
/// # Panics
///
/// Panics if `line_starts` is empty, or if `offset` is out of range or not on
/// a character boundary of `content`.
fn offset_to_line_col(content: &str, line_starts: &[usize], offset: usize) -> (usize, usize) {
    // An exact hit means `offset` is itself a line start; otherwise the
    // insertion point is one past the line that contains `offset`.
    let line_index = match line_starts.binary_search(&offset) {
        Ok(exact) => exact,
        Err(insert_at) => insert_at.saturating_sub(1),
    };
    let line_begin = line_starts[line_index];
    let chars_before = content[line_begin..offset].chars().count();
    (line_index + 1, chars_before + 1)
}
302
/// Produces a display-safe rendering of a matched secret.
///
/// * Text starting with `-----BEGIN ` is reduced to its first line followed by
///   ` …`.
/// * Text of at most 8 characters is replaced by that many `*`.
/// * Anything longer keeps its first and last 4 characters around a `…`.
fn redact_match(matched: &str) -> String {
    if matched.starts_with("-----BEGIN ") {
        let header = matched
            .lines()
            .next()
            .unwrap_or("-----BEGIN PRIVATE KEY-----");
        return format!("{header} …");
    }

    // Work in characters so multi-byte text is never split mid-character.
    let total = matched.chars().count();
    if total <= 8 {
        return "*".repeat(total);
    }
    let head: String = matched.chars().take(4).collect();
    let tail: String = matched.chars().skip(total - 4).collect();
    format!("{head}…{tail}")
}
322
323fn fingerprint(matched: &str) -> String {
324 let hash = sha2::Sha256::digest(matched.as_bytes());
325 let hex: String = hash.iter().map(|byte| format!("{byte:02x}")).collect();
326 hex[..16].to_string()
327}
328
/// Computes the Shannon entropy of `value` in bits per character, treating
/// each `char` as one symbol. An empty string yields zero.
fn shannon_entropy(value: &str) -> f64 {
    let mut total = 0usize;
    let mut histogram: BTreeMap<char, usize> = BTreeMap::new();
    for symbol in value.chars() {
        total += 1;
        *histogram.entry(symbol).or_default() += 1;
    }

    let total = total as f64;
    // Terms are summed in the map's key order so results are reproducible.
    histogram
        .into_values()
        .map(|occurrences| {
            let p = occurrences as f64 / total;
            -(p * p.log2())
        })
        .sum()
}
343
344fn emit_secret_scan_log(caller: &str, content_len: usize, findings: &[SecretFinding]) {
345 let metadata = serde_json::json!({
346 "topic": SECRET_SCAN_AUDIT_TOPIC,
347 "caller": caller,
348 "content_len": content_len,
349 "finding_count": findings.len(),
350 "clean": findings.is_empty(),
351 "findings": findings
352 .iter()
353 .map(|finding| serde_json::json!({
354 "detector": finding.detector,
355 "source": finding.source,
356 "line": finding.line,
357 "fingerprint": finding.fingerprint,
358 "redacted": finding.redacted,
359 }))
360 .collect::<Vec<_>>(),
361 });
362 let metadata = metadata
363 .as_object()
364 .cloned()
365 .map(|object| object.into_iter().collect::<BTreeMap<_, _>>())
366 .unwrap_or_default();
367 crate::events::log_info_meta("secret_scan.audit", "secret scan completed", metadata);
368}
369
/// Unit tests for the secret scanner and its audit event.
#[cfg(test)]
mod tests {
    use super::*;

    use crate::event_log::{EventLog, MemoryEventLog};
    use std::collections::BTreeSet;

    /// A provider-specific token and a quoted credential assignment are both
    /// reported, and no generic `sensitive-assignment` finding survives.
    #[test]
    fn scan_content_detects_specific_rules_and_entropy_rule() {
        let findings = scan_content(
            r#"
github_token = "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB"
config = { client_secret: "QWxhZGRpbjpPcGVuU2VzYW1lQWNjZXNzVG9rZW4=" }
"#,
        );

        assert!(findings
            .iter()
            .any(|finding| finding.detector == "github-token"));
        assert!(findings
            .iter()
            .any(|finding| finding.detector == "high-entropy-credential-assignment"));
        assert!(!findings
            .iter()
            .any(|finding| finding.detector == "sensitive-assignment"));
    }

    /// When a specific detector matches inside an assignment, only the
    /// specific finding is returned.
    #[test]
    fn scan_content_deduplicates_generic_assignment_overlaps() {
        let findings = scan_content(r#"token = "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB""#);
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].detector, "github-token");
    }

    /// Token-shaped detectors report `high` precision while the keyword-based
    /// assignment rule reports `heuristic`; no other precision value appears.
    #[test]
    fn precision_class_splits_token_shapes_from_keyword_heuristics() {
        let findings = scan_content(
            "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB\npassword = \"s3cr3t-value-here\"",
        );
        // Looks up the precision reported for the first finding of a detector.
        let precision = |detector: &str| {
            findings
                .iter()
                .find(|finding| finding.detector == detector)
                .map(|finding| finding.precision.as_str())
        };
        assert_eq!(precision("github-token"), Some("high"));
        assert_eq!(precision("sensitive-assignment"), Some("heuristic"));
        assert!(findings
            .iter()
            .all(|finding| finding.precision == "high" || finding.precision == "heuristic"));
    }

    /// Without an overlapping specific detector the generic assignment
    /// finding is kept.
    #[test]
    fn scan_content_keeps_generic_assignment_without_specific_detector() {
        let findings = scan_content(r#"token = "secret123""#);
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].detector, "sensitive-assignment");
    }

    /// A source declaration that merely uses a secret-like identifier
    /// produces no findings.
    #[test]
    fn scan_content_preserves_source_declarations_with_secretish_identifiers() {
        let findings = scan_content("pub const Token = struct { kind: u8 };\n");
        assert!(findings.is_empty());
    }

    /// A PEM private key block is reported once, spans the block without the
    /// trailing newline, and is redacted down to its header line.
    #[test]
    fn scan_content_redacts_private_key_blocks() {
        let findings = scan_content(
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----\n",
        );
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].detector, "private-key-block");
        assert_eq!(
            findings[0].end_offset - findings[0].start_offset,
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----"
                .len()
        );
        assert_eq!(
            findings[0].redacted,
            "-----BEGIN OPENSSH PRIVATE KEY----- …"
        );
    }

    /// Bearer tokens and JWT-shaped strings are each reported by their own
    /// detector.
    #[test]
    fn scan_content_covers_redaction_only_token_shapes() {
        let findings = scan_content(
            "Authorization: Bearer abcDEFghi123_-+/=xyz\njwt=eyJabcd.eyJefgh.signature_pad\n",
        );
        let detectors = findings
            .iter()
            .map(|finding| finding.detector.as_str())
            .collect::<BTreeSet<_>>();
        assert!(detectors.contains("bearer-token"));
        assert!(detectors.contains("jwt-token"));
    }

    /// Synthetic AI-provider keys are attributed to the expected detector and
    /// source, and no redacted string contains a full key.
    #[test]
    fn scan_content_covers_ai_provider_token_shapes() {
        let huggingface = format!("hf_{}", "a".repeat(24));
        let cerebras = format!("csk-{}", "b".repeat(48));
        let together = format!("tgp_v1_{}", "c".repeat(32));
        let google = format!("AIza{}", "D".repeat(35));
        let content = format!("{huggingface}\n{cerebras}\n{together}\n{google}\n");

        let findings = scan_content(&content);
        let detectors = findings
            .iter()
            .map(|finding| (finding.detector.as_str(), finding.source.as_str()))
            .collect::<BTreeSet<_>>();

        assert!(detectors.contains(&("huggingface-token", "huggingface-docs")));
        assert!(detectors.contains(&("cerebras-api-key", "cerebras-docs")));
        assert!(detectors.contains(&("together-api-key", "together-bug-report")));
        assert!(detectors.contains(&("google-api-key", "microsoft-purview")));
        for secret in [&huggingface, &cerebras, &together, &google] {
            assert!(!findings
                .iter()
                .any(|finding| finding.redacted.contains(secret)));
        }
    }

    /// Appending an audit event for a detected secret writes exactly one
    /// `scan_detected` event whose first finding carries redacted text only.
    #[tokio::test(flavor = "current_thread")]
    async fn append_secret_scan_audit_writes_redacted_event() {
        let log = MemoryEventLog::new(32);
        let findings = scan_content(r#"token = "sk-abcdefghijklmnopqrstuvwx123456""#);
        append_secret_scan_audit(&log, "test.secret_scan", 44, &findings)
            .await
            .unwrap();

        let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).unwrap();
        let events = log.read_range(&topic, None, 10).await.unwrap();
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].1.kind, "scan_detected");
        assert_eq!(events[0].1.payload["caller"], "test.secret_scan");
        let redacted = events[0].1.payload["findings"][0]["redacted"]
            .as_str()
            .unwrap();
        assert!(redacted.contains('…'));
        assert!(!redacted.contains("abcdefghijklmnopqrstuvwx123456"));
    }
}