Skip to main content

harn_vm/stdlib/
secret_scan.rs

1use std::collections::BTreeMap;
2use std::sync::LazyLock;
3
4use regex::Regex;
5use serde::{Deserialize, Serialize};
6use sha2::Digest;
7
8use crate::event_log::{active_event_log, EventLog, LogEvent, Topic};
9use crate::secret_patterns::{SecretPatternSpec, DEFAULT_SECRET_PATTERN_SPECS};
10use crate::value::{VmError, VmValue};
11use crate::vm::Vm;
12
/// Name of the event-log topic that secret scan audit events are appended to.
pub const SECRET_SCAN_AUDIT_TOPIC: &str = "audit.secret_scan";
/// Minimum Shannon entropy (bits per character) a quoted value must reach before the
/// high-entropy assignment rule reports it; values below this are skipped.
const HIGH_ENTROPY_THRESHOLD: f64 = 3.5;
15
/// One secret detected in scanned content.
///
/// The raw secret is never stored here: only a redacted rendering and a short hash-based
/// fingerprint of the matched text are kept.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SecretFinding {
    /// Identifier of the detector that produced this finding (e.g. `github-token`).
    pub detector: String,
    /// Provenance label of the detector's pattern.
    pub source: String,
    /// Human-readable title of the detector.
    pub title: String,
    /// 1-based line number on which the match starts.
    pub line: usize,
    /// 1-based column (counted in `char`s) of the match start, relative to its line.
    pub column_start: usize,
    /// 1-based column (counted in `char`s) of the exclusive match end, relative to the line
    /// that contains the end offset (which may differ from `line` for multi-line matches).
    pub column_end: usize,
    /// Byte offset of the match start within the scanned content.
    pub start_offset: usize,
    /// Byte offset one past the last byte of the match.
    pub end_offset: usize,
    /// Redacted rendering of the matched text.
    pub redacted: String,
    /// First 16 hex characters of the SHA-256 digest of the matched text.
    pub fingerprint: String,
}
29
/// A built-in detector: its static pattern spec paired with the compiled form of its regex.
struct SecretRule {
    /// Static description of the detector; supplies `detector`, `source`, `title` and `regex`.
    spec: &'static SecretPatternSpec,
    /// Regex compiled from `spec.regex`.
    regex: Regex,
}
34
35static SECRET_RULES: LazyLock<Vec<SecretRule>> = LazyLock::new(|| {
36    DEFAULT_SECRET_PATTERN_SPECS
37        .iter()
38        .map(|spec| SecretRule {
39            spec,
40            regex: Regex::new(spec.regex).unwrap_or_else(|error| {
41                panic!("invalid {} secret scan regex: {error}", spec.detector)
42            }),
43        })
44        .collect()
45});
46
47static HIGH_ENTROPY_ASSIGNMENT_RULE: LazyLock<Regex> = LazyLock::new(|| {
48    Regex::new(
49        r#"(?im)(?:secret|token|api[_-]?key|access[_-]?key|password|passwd|pwd|client[_-]?secret|private[_-]?key)[^\n:=]{0,32}(?::|=)\s*["']([A-Za-z0-9+/=_\.-]{20,})["']"#,
50    )
51    .unwrap()
52});
53
54pub fn scan_content(content: &str) -> Vec<SecretFinding> {
55    let line_starts = line_starts(content);
56    let mut findings = Vec::new();
57
58    for rule in SECRET_RULES.iter() {
59        for mat in rule.regex.find_iter(content) {
60            findings.push(build_finding(
61                content,
62                &line_starts,
63                rule.spec.detector,
64                rule.spec.source,
65                rule.spec.title,
66                mat.start(),
67                mat.end(),
68                mat.as_str(),
69            ));
70        }
71    }
72
73    for captures in HIGH_ENTROPY_ASSIGNMENT_RULE.captures_iter(content) {
74        let Some(secret) = captures.get(1) else {
75            continue;
76        };
77        if shannon_entropy(secret.as_str()) < HIGH_ENTROPY_THRESHOLD {
78            continue;
79        }
80        findings.push(build_finding(
81            content,
82            &line_starts,
83            "high-entropy-credential-assignment",
84            "trufflehog",
85            "High-entropy secret assignment",
86            secret.start(),
87            secret.end(),
88            secret.as_str(),
89        ));
90    }
91
92    findings.sort_by(|left, right| {
93        left.start_offset
94            .cmp(&right.start_offset)
95            .then(left.end_offset.cmp(&right.end_offset))
96            .then(left.detector.cmp(&right.detector))
97    });
98    let higher_specificity_spans = findings
99        .iter()
100        .map(|finding| {
101            (
102                finding.start_offset,
103                finding.end_offset,
104                detector_specificity(&finding.detector),
105            )
106        })
107        .collect::<Vec<_>>();
108    findings.retain(|finding| {
109        let specificity = detector_specificity(&finding.detector);
110        !higher_specificity_spans
111            .iter()
112            .any(|(start, end, other_specificity)| {
113                *other_specificity > specificity
114                    && spans_overlap((finding.start_offset, finding.end_offset), (*start, *end))
115            })
116    });
117    findings.dedup_by(|left, right| {
118        left.detector == right.detector
119            && left.start_offset == right.start_offset
120            && left.end_offset == right.end_offset
121    });
122    findings
123}
124
/// Ranks a detector by how specific its match is; higher ranks win when spans overlap.
///
/// `sensitive-assignment` ranks 0, `high-entropy-credential-assignment` ranks 1, and every
/// other detector ranks 2.
fn detector_specificity(detector: &str) -> u8 {
    if detector == "sensitive-assignment" {
        0
    } else if detector == "high-entropy-credential-assignment" {
        1
    } else {
        2
    }
}
132
/// Returns `true` when the half-open byte ranges `left` and `right` share at least one byte.
///
/// Each span is `(start, end)` with `end` exclusive, so ranges that merely touch do not overlap.
fn spans_overlap(left: (usize, usize), right: (usize, usize)) -> bool {
    let (left_start, left_end) = left;
    let (right_start, right_end) = right;
    right_start < left_end && left_start < right_end
}
136
137pub async fn append_secret_scan_audit<L: EventLog + ?Sized>(
138    event_log: &L,
139    caller: &str,
140    content_len: usize,
141    findings: &[SecretFinding],
142) -> Result<(), crate::event_log::LogError> {
143    let payload = serde_json::json!({
144        "caller": caller,
145        "content_len": content_len,
146        "finding_count": findings.len(),
147        "clean": findings.is_empty(),
148        "findings": findings
149            .iter()
150            .map(|finding| {
151                serde_json::json!({
152                    "detector": finding.detector,
153                    "source": finding.source,
154                    "title": finding.title,
155                    "line": finding.line,
156                    "column_start": finding.column_start,
157                    "column_end": finding.column_end,
158                    "start_offset": finding.start_offset,
159                    "end_offset": finding.end_offset,
160                    "fingerprint": finding.fingerprint,
161                    "redacted": finding.redacted,
162                })
163            })
164            .collect::<Vec<_>>(),
165        "observed_at": crate::orchestration::now_rfc3339(),
166    });
167    let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).expect("secret scan audit topic is valid");
168    let kind = if findings.is_empty() {
169        "scan_clean"
170    } else {
171        "scan_detected"
172    };
173    event_log
174        .append(&topic, LogEvent::new(kind, payload))
175        .await?;
176    Ok(())
177}
178
179pub async fn audit_secret_scan_active(
180    caller: &str,
181    content_len: usize,
182    findings: &[SecretFinding],
183) {
184    emit_secret_scan_log(caller, content_len, findings);
185
186    let Some(event_log) = active_event_log() else {
187        return;
188    };
189
190    if let Err(error) =
191        append_secret_scan_audit(event_log.as_ref(), caller, content_len, findings).await
192    {
193        crate::events::log_warn(
194            "secret_scan.audit",
195            &format!("failed to append secret scan audit event: {error}"),
196        );
197    }
198}
199
200pub(crate) fn register_secret_scan_builtins(vm: &mut Vm) {
201    vm.register_async_builtin("secret_scan", |_ctx, args| async move {
202        let content = match args.first() {
203            Some(VmValue::Nil) | None => {
204                return Err(VmError::Runtime("secret_scan: content is required".into()));
205            }
206            Some(value) => value.display(),
207        };
208
209        let findings = scan_content(&content);
210        audit_secret_scan_active("stdlib.secret_scan", content.len(), &findings).await;
211
212        let value = serde_json::to_value(findings)
213            .map_err(|error| VmError::Runtime(format!("secret_scan: {error}")))?;
214        Ok(crate::schema::json_to_vm_value(&value))
215    });
216}
217
218fn build_finding(
219    content: &str,
220    line_starts: &[usize],
221    detector: &str,
222    source: &str,
223    title: &str,
224    start_offset: usize,
225    end_offset: usize,
226    matched: &str,
227) -> SecretFinding {
228    let (line, column_start) = offset_to_line_col(content, line_starts, start_offset);
229    let (_, column_end) = offset_to_line_col(content, line_starts, end_offset);
230    SecretFinding {
231        detector: detector.to_string(),
232        source: source.to_string(),
233        title: title.to_string(),
234        line,
235        column_start,
236        column_end,
237        start_offset,
238        end_offset,
239        redacted: redact_match(matched),
240        fingerprint: fingerprint(matched),
241    }
242}
243
/// Returns the byte offset at which each line of `content` begins.
///
/// The first entry is always `0`; every `\n` contributes the offset of the byte after it, so
/// content ending in a newline yields a final entry equal to `content.len()`.
fn line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .match_indices('\n')
                .map(|(newline_at, _)| newline_at + 1),
        )
        .collect()
}
253
/// Converts a byte `offset` into a 1-based `(line, column)` pair.
///
/// `line_starts` must hold the ascending byte offsets at which lines begin. The column
/// counts `char`s (not bytes) between the start of the line and `offset`, plus one.
///
/// # Panics
///
/// Panics if `line_starts` is empty, or if `offset` is out of range or not on a `char`
/// boundary of `content`.
fn offset_to_line_col(content: &str, line_starts: &[usize], offset: usize) -> (usize, usize) {
    // Line starts are strictly increasing, so an exact hit is the line itself and a miss
    // lands just after the line that contains `offset`.
    let line_index = match line_starts.binary_search(&offset) {
        Ok(exact) => exact,
        Err(insert_at) => insert_at.saturating_sub(1),
    };
    let preceding = &content[line_starts[line_index]..offset];
    (line_index + 1, preceding.chars().count() + 1)
}
262
/// Produces a display-safe stand-in for a matched secret.
///
/// * A match starting with `-----BEGIN ` is reduced to its armor header (through the dashes
///   that close it) followed by ` …`; if the first line has no closing dashes, the generic
///   header `-----BEGIN PRIVATE KEY-----` is used instead.
/// * A match of at most eight characters becomes that many `*`.
/// * Any longer match keeps its first four and last four characters, joined by `…`.
fn redact_match(matched: &str) -> String {
    const PEM_PREFIX: &str = "-----BEGIN ";
    const PEM_DASHES: &str = "-----";
    const PEM_FALLBACK_HEADER: &str = "-----BEGIN PRIVATE KEY-----";

    if let Some(after_prefix) = matched.strip_prefix(PEM_PREFIX) {
        // Cut at the dashes that close the header, not at the first newline: a key block that
        // sits on one physical line (e.g. JSON-embedded with literal `\n` escapes) would
        // otherwise be copied into the "redacted" text in full.
        let label_line = after_prefix.lines().next().unwrap_or("");
        let header = match label_line.find(PEM_DASHES) {
            Some(label_len) => &matched[..PEM_PREFIX.len() + label_len + PEM_DASHES.len()],
            None => PEM_FALLBACK_HEADER,
        };
        return format!("{header} …");
    }

    let chars: Vec<char> = matched.chars().collect();
    if chars.len() <= 8 {
        return "*".repeat(chars.len());
    }
    // NOTE(review): matches of 9–15 characters still expose more than half of their
    // characters here; left unchanged to keep the existing redaction format.
    let prefix: String = chars[..4].iter().collect();
    let suffix: String = chars[chars.len() - 4..].iter().collect();
    format!("{prefix}…{suffix}")
}
282
283fn fingerprint(matched: &str) -> String {
284    let hash = sha2::Sha256::digest(matched.as_bytes());
285    let hex: String = hash.iter().map(|byte| format!("{byte:02x}")).collect();
286    hex[..16].to_string()
287}
288
/// Computes the Shannon entropy of `value` in bits per character.
///
/// Each distinct `char` contributes `-p * log2(p)`, where `p` is its relative frequency.
/// An empty string has no characters to contribute and yields zero.
fn shannon_entropy(value: &str) -> f64 {
    let mut frequencies: BTreeMap<char, usize> = BTreeMap::new();
    let mut total = 0usize;
    for symbol in value.chars() {
        *frequencies.entry(symbol).or_default() += 1;
        total += 1;
    }

    let total = total as f64;
    // Iterating the ordered map keeps the floating-point summation order deterministic.
    frequencies
        .into_values()
        .map(|occurrences| {
            let probability = occurrences as f64 / total;
            -(probability * probability.log2())
        })
        .sum()
}
303
304fn emit_secret_scan_log(caller: &str, content_len: usize, findings: &[SecretFinding]) {
305    let metadata = serde_json::json!({
306        "topic": SECRET_SCAN_AUDIT_TOPIC,
307        "caller": caller,
308        "content_len": content_len,
309        "finding_count": findings.len(),
310        "clean": findings.is_empty(),
311        "findings": findings
312            .iter()
313            .map(|finding| serde_json::json!({
314                "detector": finding.detector,
315                "source": finding.source,
316                "line": finding.line,
317                "fingerprint": finding.fingerprint,
318                "redacted": finding.redacted,
319            }))
320            .collect::<Vec<_>>(),
321    });
322    let metadata = metadata
323        .as_object()
324        .cloned()
325        .map(|object| object.into_iter().collect::<BTreeMap<_, _>>())
326        .unwrap_or_default();
327    crate::events::log_info_meta("secret_scan.audit", "secret scan completed", metadata);
328}
329
#[cfg(test)]
mod tests {
    use super::*;

    use crate::event_log::{EventLog, MemoryEventLog};
    use std::collections::BTreeSet;

    /// Collects the distinct detector ids present in `findings`.
    fn detector_ids(findings: &[SecretFinding]) -> BTreeSet<&str> {
        findings
            .iter()
            .map(|finding| finding.detector.as_str())
            .collect()
    }

    /// Asserts that `findings` holds exactly one entry and returns it.
    fn sole_finding(findings: &[SecretFinding]) -> &SecretFinding {
        assert_eq!(findings.len(), 1);
        &findings[0]
    }

    #[test]
    fn scan_content_detects_specific_rules_and_entropy_rule() {
        let sample = r#"
github_token = "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB"
config = { client_secret: "QWxhZGRpbjpPcGVuU2VzYW1lQWNjZXNzVG9rZW4=" }
"#;
        let findings = scan_content(sample);
        let detectors = detector_ids(&findings);

        assert!(detectors.contains("github-token"));
        assert!(detectors.contains("high-entropy-credential-assignment"));
        assert!(!detectors.contains("sensitive-assignment"));
    }

    #[test]
    fn scan_content_deduplicates_generic_assignment_overlaps() {
        let findings = scan_content(r#"token = "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB""#);
        let only = sole_finding(&findings);
        assert_eq!(only.detector, "github-token");
    }

    #[test]
    fn scan_content_keeps_generic_assignment_without_specific_detector() {
        let findings = scan_content(r#"token = "secret123""#);
        let only = sole_finding(&findings);
        assert_eq!(only.detector, "sensitive-assignment");
    }

    #[test]
    fn scan_content_preserves_source_declarations_with_secretish_identifiers() {
        // A type declaration that merely mentions "Token" must not be reported.
        let findings = scan_content("pub const Token = struct { kind: u8 };\n");
        assert!(findings.is_empty());
    }

    #[test]
    fn scan_content_redacts_private_key_blocks() {
        let block =
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----";
        let findings = scan_content(
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----\n",
        );

        let only = sole_finding(&findings);
        assert_eq!(only.detector, "private-key-block");
        // The span covers the armored block but not the trailing newline.
        assert_eq!(only.end_offset - only.start_offset, block.len());
        assert_eq!(only.redacted, "-----BEGIN OPENSSH PRIVATE KEY----- …");
    }

    #[test]
    fn scan_content_covers_redaction_only_token_shapes() {
        let findings = scan_content(
            "Authorization: Bearer abcDEFghi123_-+/=xyz\njwt=eyJabcd.eyJefgh.signature_pad\n",
        );
        let detectors = detector_ids(&findings);
        assert!(detectors.contains("bearer-token"));
        assert!(detectors.contains("jwt-token"));
    }

    #[test]
    fn scan_content_covers_ai_provider_token_shapes() {
        let huggingface = format!("hf_{}", "a".repeat(24));
        let cerebras = format!("csk-{}", "b".repeat(48));
        let together = format!("tgp_v1_{}", "c".repeat(32));
        let google = format!("AIza{}", "D".repeat(35));
        let content = format!("{huggingface}\n{cerebras}\n{together}\n{google}\n");

        let findings = scan_content(&content);

        let expected = [
            ("huggingface-token", "huggingface-docs"),
            ("cerebras-api-key", "cerebras-docs"),
            ("together-api-key", "together-bug-report"),
            ("google-api-key", "microsoft-purview"),
        ];
        for (detector, source) in expected {
            assert!(findings
                .iter()
                .any(|finding| finding.detector == detector && finding.source == source));
        }

        // No redacted rendering may contain the full secret.
        for secret in [&huggingface, &cerebras, &together, &google] {
            assert!(findings
                .iter()
                .all(|finding| !finding.redacted.contains(secret.as_str())));
        }
    }

    #[tokio::test(flavor = "current_thread")]
    async fn append_secret_scan_audit_writes_redacted_event() {
        let log = MemoryEventLog::new(32);
        let findings = scan_content(r#"token = "sk-abcdefghijklmnopqrstuvwx123456""#);
        append_secret_scan_audit(&log, "test.secret_scan", 44, &findings)
            .await
            .unwrap();

        let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).unwrap();
        let events = log.read_range(&topic, None, 10).await.unwrap();
        assert_eq!(events.len(), 1);

        let event = &events[0].1;
        assert_eq!(event.kind, "scan_detected");
        assert_eq!(event.payload["caller"], "test.secret_scan");

        let redacted = event.payload["findings"][0]["redacted"]
            .as_str()
            .unwrap();
        assert!(redacted.contains('…'));
        assert!(!redacted.contains("abcdefghijklmnopqrstuvwx123456"));
    }
}