Skip to main content

harn_vm/stdlib/
secret_scan.rs

1use std::collections::{BTreeMap, BTreeSet};
2use std::sync::LazyLock;
3
4use regex::Regex;
5use serde::{Deserialize, Serialize};
6use sha2::Digest;
7
8use crate::event_log::{active_event_log, EventLog, LogEvent, Topic};
9use crate::secret_patterns::{SecretPatternSpec, DEFAULT_SECRET_PATTERN_SPECS};
10use crate::value::{VmError, VmValue};
11use crate::vm::Vm;
12
/// Name of the event-log topic that secret scan audit events are appended to.
pub const SECRET_SCAN_AUDIT_TOPIC: &str = "audit.secret_scan";

/// Minimum Shannon entropy (in bits per character, as computed by
/// `shannon_entropy`) a quoted value must reach before the generic
/// high-entropy assignment rule reports it.
const HIGH_ENTROPY_THRESHOLD: f64 = 3.5;
15
16#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
17pub struct SecretFinding {
18    pub detector: String,
19    pub source: String,
20    pub title: String,
21    pub line: usize,
22    pub column_start: usize,
23    pub column_end: usize,
24    pub start_offset: usize,
25    pub end_offset: usize,
26    pub redacted: String,
27    pub fingerprint: String,
28}
29
30struct SecretRule {
31    spec: &'static SecretPatternSpec,
32    regex: Regex,
33}
34
35static SECRET_RULES: LazyLock<Vec<SecretRule>> = LazyLock::new(|| {
36    DEFAULT_SECRET_PATTERN_SPECS
37        .iter()
38        .map(|spec| SecretRule {
39            spec,
40            regex: Regex::new(spec.regex).unwrap_or_else(|error| {
41                panic!("invalid {} secret scan regex: {error}", spec.detector)
42            }),
43        })
44        .collect()
45});
46
47static HIGH_ENTROPY_ASSIGNMENT_RULE: LazyLock<Regex> = LazyLock::new(|| {
48    Regex::new(
49        r#"(?im)(?:secret|token|api[_-]?key|access[_-]?key|password|passwd|pwd|client[_-]?secret|private[_-]?key)[^\n:=]{0,32}(?::|=)\s*["']([A-Za-z0-9+/=_\.-]{20,})["']"#,
50    )
51    .unwrap()
52});
53
54pub fn scan_content(content: &str) -> Vec<SecretFinding> {
55    let line_starts = line_starts(content);
56    let mut findings = Vec::new();
57
58    for rule in SECRET_RULES.iter() {
59        for mat in rule.regex.find_iter(content) {
60            findings.push(build_finding(
61                content,
62                &line_starts,
63                rule.spec.detector,
64                rule.spec.source,
65                rule.spec.title,
66                mat.start(),
67                mat.end(),
68                mat.as_str(),
69            ));
70        }
71    }
72
73    for captures in HIGH_ENTROPY_ASSIGNMENT_RULE.captures_iter(content) {
74        let Some(secret) = captures.get(1) else {
75            continue;
76        };
77        if shannon_entropy(secret.as_str()) < HIGH_ENTROPY_THRESHOLD {
78            continue;
79        }
80        findings.push(build_finding(
81            content,
82            &line_starts,
83            "high-entropy-credential-assignment",
84            "trufflehog",
85            "High-entropy secret assignment",
86            secret.start(),
87            secret.end(),
88            secret.as_str(),
89        ));
90    }
91
92    findings.sort_by(|left, right| {
93        left.start_offset
94            .cmp(&right.start_offset)
95            .then(left.end_offset.cmp(&right.end_offset))
96            .then(left.detector.cmp(&right.detector))
97    });
98    let specific_spans: BTreeSet<(usize, usize)> = findings
99        .iter()
100        .filter(|finding| finding.detector != "high-entropy-credential-assignment")
101        .map(|finding| (finding.start_offset, finding.end_offset))
102        .collect();
103    findings.retain(|finding| {
104        finding.detector != "high-entropy-credential-assignment"
105            || !specific_spans.contains(&(finding.start_offset, finding.end_offset))
106    });
107    findings.dedup_by(|left, right| {
108        left.detector == right.detector
109            && left.start_offset == right.start_offset
110            && left.end_offset == right.end_offset
111    });
112    findings
113}
114
115pub async fn append_secret_scan_audit<L: EventLog + ?Sized>(
116    event_log: &L,
117    caller: &str,
118    content_len: usize,
119    findings: &[SecretFinding],
120) -> Result<(), crate::event_log::LogError> {
121    let payload = serde_json::json!({
122        "caller": caller,
123        "content_len": content_len,
124        "finding_count": findings.len(),
125        "clean": findings.is_empty(),
126        "findings": findings
127            .iter()
128            .map(|finding| {
129                serde_json::json!({
130                    "detector": finding.detector,
131                    "source": finding.source,
132                    "title": finding.title,
133                    "line": finding.line,
134                    "column_start": finding.column_start,
135                    "column_end": finding.column_end,
136                    "start_offset": finding.start_offset,
137                    "end_offset": finding.end_offset,
138                    "fingerprint": finding.fingerprint,
139                    "redacted": finding.redacted,
140                })
141            })
142            .collect::<Vec<_>>(),
143        "observed_at": crate::orchestration::now_rfc3339(),
144    });
145    let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).expect("secret scan audit topic is valid");
146    let kind = if findings.is_empty() {
147        "scan_clean"
148    } else {
149        "scan_detected"
150    };
151    event_log
152        .append(&topic, LogEvent::new(kind, payload))
153        .await?;
154    Ok(())
155}
156
157pub async fn audit_secret_scan_active(
158    caller: &str,
159    content_len: usize,
160    findings: &[SecretFinding],
161) {
162    emit_secret_scan_log(caller, content_len, findings);
163
164    let Some(event_log) = active_event_log() else {
165        return;
166    };
167
168    if let Err(error) =
169        append_secret_scan_audit(event_log.as_ref(), caller, content_len, findings).await
170    {
171        crate::events::log_warn(
172            "secret_scan.audit",
173            &format!("failed to append secret scan audit event: {error}"),
174        );
175    }
176}
177
178pub(crate) fn register_secret_scan_builtins(vm: &mut Vm) {
179    vm.register_async_builtin("secret_scan", |args| async move {
180        let content = match args.first() {
181            Some(VmValue::Nil) | None => {
182                return Err(VmError::Runtime("secret_scan: content is required".into()));
183            }
184            Some(value) => value.display(),
185        };
186
187        let findings = scan_content(&content);
188        audit_secret_scan_active("stdlib.secret_scan", content.len(), &findings).await;
189
190        let value = serde_json::to_value(findings)
191            .map_err(|error| VmError::Runtime(format!("secret_scan: {error}")))?;
192        Ok(crate::schema::json_to_vm_value(&value))
193    });
194}
195
196fn build_finding(
197    content: &str,
198    line_starts: &[usize],
199    detector: &str,
200    source: &str,
201    title: &str,
202    start_offset: usize,
203    end_offset: usize,
204    matched: &str,
205) -> SecretFinding {
206    let (line, column_start) = offset_to_line_col(content, line_starts, start_offset);
207    let (_, column_end) = offset_to_line_col(content, line_starts, end_offset);
208    SecretFinding {
209        detector: detector.to_string(),
210        source: source.to_string(),
211        title: title.to_string(),
212        line,
213        column_start,
214        column_end,
215        start_offset,
216        end_offset,
217        redacted: redact_match(matched),
218        fingerprint: fingerprint(matched),
219    }
220}
221
/// Returns the byte offset at which every line of `content` begins.
///
/// The first entry is always `0`; each `\n` contributes the offset of the byte
/// that follows it, so the result is strictly increasing.
fn line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .match_indices('\n')
                .map(|(newline_at, _)| newline_at + 1),
        )
        .collect()
}
231
/// Converts a byte `offset` into a 1-based `(line, column)` pair.
///
/// `line_starts` must be the strictly increasing table of line-start byte
/// offsets for `content`, beginning with `0`. The column counts characters
/// (not bytes) from the start of the line.
///
/// # Panics
///
/// Panics if `line_starts` is empty, or if `offset` is out of bounds or not on
/// a character boundary of `content`.
fn offset_to_line_col(content: &str, line_starts: &[usize], offset: usize) -> (usize, usize) {
    // An exact hit means `offset` is the first byte of that line; otherwise the
    // containing line is the one just before the insertion point.
    let line_index = match line_starts.binary_search(&offset) {
        Ok(exact) => exact,
        Err(insertion) => insertion.saturating_sub(1),
    };
    let line_begin = line_starts[line_index];
    let preceding_chars = content[line_begin..offset].chars().count();
    (line_index + 1, preceding_chars + 1)
}
240
/// Produces a log-safe preview of a matched secret.
///
/// * PEM-style blocks (text starting with `-----BEGIN `) are reduced to their
///   `-----BEGIN <label>-----` header followed by ` …`. Only the header is
///   kept: anything after the header's closing dashes is dropped even when it
///   sits on the same physical line, so a key serialized on a single line is
///   never echoed back. If the first line has no closing dashes, a generic
///   `-----BEGIN PRIVATE KEY-----` header is used instead.
/// * Matches of at most 8 characters are replaced by the same number of `*`.
/// * Longer matches keep their first and last 4 characters around a `…`.
fn redact_match(matched: &str) -> String {
    const PEM_PREFIX: &str = "-----BEGIN ";
    const PEM_DASHES: &str = "-----";
    const PEM_FALLBACK_HEADER: &str = "-----BEGIN PRIVATE KEY-----";

    if let Some(after_prefix) = matched.strip_prefix(PEM_PREFIX) {
        // Look for the dashes that close the header, but only on the first line.
        let first_line = after_prefix.lines().next().unwrap_or("");
        let header = match first_line.find(PEM_DASHES) {
            // The index is a byte offset at an ASCII '-', so slicing is boundary-safe.
            Some(label_len) => &matched[..PEM_PREFIX.len() + label_len + PEM_DASHES.len()],
            None => PEM_FALLBACK_HEADER,
        };
        return format!("{header} …");
    }

    let chars: Vec<char> = matched.chars().collect();
    if chars.len() <= 8 {
        // Too short to show any part of it without revealing most of the value.
        return "*".repeat(chars.len());
    }
    let prefix: String = chars[..4].iter().collect();
    let suffix: String = chars[chars.len() - 4..].iter().collect();
    format!("{prefix}…{suffix}")
}
260
261fn fingerprint(matched: &str) -> String {
262    let hash = sha2::Sha256::digest(matched.as_bytes());
263    let hex: String = hash.iter().map(|byte| format!("{byte:02x}")).collect();
264    hex[..16].to_string()
265}
266
/// Computes the Shannon entropy of `value` in bits per character.
///
/// Each distinct character contributes `-p * log2(p)`, where `p` is its share
/// of all characters. An empty string, or one made of a single repeated
/// character, has zero entropy.
fn shannon_entropy(value: &str) -> f64 {
    let mut frequencies: BTreeMap<char, usize> = BTreeMap::new();
    let mut total_chars = 0usize;
    for symbol in value.chars() {
        *frequencies.entry(symbol).or_default() += 1;
        total_chars += 1;
    }

    let total = total_chars as f64;
    frequencies
        .into_values()
        .map(|occurrences| {
            let share = occurrences as f64 / total;
            -(share * share.log2())
        })
        .sum()
}
281
282fn emit_secret_scan_log(caller: &str, content_len: usize, findings: &[SecretFinding]) {
283    let metadata = serde_json::json!({
284        "topic": SECRET_SCAN_AUDIT_TOPIC,
285        "caller": caller,
286        "content_len": content_len,
287        "finding_count": findings.len(),
288        "clean": findings.is_empty(),
289        "findings": findings
290            .iter()
291            .map(|finding| serde_json::json!({
292                "detector": finding.detector,
293                "source": finding.source,
294                "line": finding.line,
295                "fingerprint": finding.fingerprint,
296                "redacted": finding.redacted,
297            }))
298            .collect::<Vec<_>>(),
299    });
300    let metadata = metadata
301        .as_object()
302        .cloned()
303        .map(|object| object.into_iter().collect::<BTreeMap<_, _>>())
304        .unwrap_or_default();
305    crate::events::log_info_meta("secret_scan.audit", "secret scan completed", metadata);
306}
307
#[cfg(test)]
mod tests {
    use super::*;

    use crate::event_log::{EventLog, MemoryEventLog};

    /// Reports whether any finding was produced by `detector`.
    fn has_detector(findings: &[SecretFinding], detector: &str) -> bool {
        findings.iter().any(|finding| finding.detector == detector)
    }

    /// A specific rule and the generic entropy rule both fire on one input.
    #[test]
    fn scan_content_detects_specific_rules_and_entropy_rule() {
        let sample = r#"
github_token = "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB"
config = { client_secret: "QWxhZGRpbjpPcGVuU2VzYW1lQWNjZXNzVG9rZW4=" }
"#;
        let findings = scan_content(sample);

        assert!(has_detector(&findings, "github-token"));
        assert!(has_detector(
            &findings,
            "high-entropy-credential-assignment"
        ));
    }

    /// A PEM block is one finding spanning the whole block, previewed by its header only.
    #[test]
    fn scan_content_redacts_private_key_blocks() {
        const KEY_BLOCK: &str =
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----";

        let findings = scan_content(&format!("{KEY_BLOCK}\n"));
        assert_eq!(findings.len(), 1);

        let finding = &findings[0];
        assert_eq!(finding.detector, "private-key-block");
        assert_eq!(
            finding.end_offset - finding.start_offset,
            KEY_BLOCK.len()
        );
        assert_eq!(finding.redacted, "-----BEGIN OPENSSH PRIVATE KEY----- …");
    }

    /// Bearer and JWT token shapes are covered by dedicated detectors.
    #[test]
    fn scan_content_covers_redaction_only_token_shapes() {
        let sample =
            "Authorization: Bearer abcDEFghi123_-+/=xyz\njwt=eyJabcd.eyJefgh.signature_pad\n";
        let findings = scan_content(sample);

        let detectors: BTreeSet<&str> = findings
            .iter()
            .map(|finding| finding.detector.as_str())
            .collect();
        assert!(detectors.contains("bearer-token"));
        assert!(detectors.contains("jwt-token"));
    }

    /// The audit event carries a redacted preview, never the raw secret.
    #[tokio::test(flavor = "current_thread")]
    async fn append_secret_scan_audit_writes_redacted_event() {
        let log = MemoryEventLog::new(32);
        let findings = scan_content(r#"token = "sk-abcdefghijklmnopqrstuvwx123456""#);
        append_secret_scan_audit(&log, "test.secret_scan", 44, &findings)
            .await
            .unwrap();

        let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).unwrap();
        let events = log.read_range(&topic, None, 10).await.unwrap();
        assert_eq!(events.len(), 1);

        let event = &events[0].1;
        assert_eq!(event.kind, "scan_detected");
        assert_eq!(event.payload["caller"], "test.secret_scan");

        let redacted = event.payload["findings"][0]["redacted"]
            .as_str()
            .unwrap();
        assert!(redacted.contains('…'));
        assert!(!redacted.contains("abcdefghijklmnopqrstuvwx123456"));
    }
}
381}