// harn-vm 0.8.37
//
// Async bytecode virtual machine for the Harn programming language
// Documentation
use std::collections::{BTreeMap, BTreeSet};
use std::sync::LazyLock;

use regex::Regex;
use serde::{Deserialize, Serialize};
use sha2::Digest;

use crate::event_log::{active_event_log, EventLog, LogEvent, Topic};
use crate::secret_patterns::{SecretPatternSpec, DEFAULT_SECRET_PATTERN_SPECS};
use crate::value::{VmError, VmValue};
use crate::vm::Vm;

/// Name of the event-log topic that secret-scan audit events are appended to.
/// It is also echoed as the `topic` field of the structured log record.
pub const SECRET_SCAN_AUDIT_TOPIC: &str = "audit.secret_scan";
/// Minimum Shannon entropy (bits per character, see `shannon_entropy`) that a
/// quoted credential-like value must reach; `scan_content` skips candidates
/// that score below this.
const HIGH_ENTROPY_THRESHOLD: f64 = 3.5;

/// One secret detected by `scan_content`.
///
/// The raw matched text is never stored: only a redacted rendering and a
/// short hash-based fingerprint are kept, next to the location of the match.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SecretFinding {
    /// Identifier of the rule that produced the finding.
    pub detector: String,
    /// Source label attached to the rule (the entropy rule uses `"trufflehog"`).
    pub source: String,
    /// Human-readable title of the rule.
    pub title: String,
    /// 1-based line number on which the match starts.
    pub line: usize,
    /// 1-based column of the match start, counted in characters (not bytes).
    pub column_start: usize,
    /// 1-based column of the match end, counted in characters on the line
    /// that contains the end offset.
    pub column_end: usize,
    /// Offset of the match start within the scanned content, as reported by
    /// the regex match.
    pub start_offset: usize,
    /// Offset of the match end within the scanned content, as reported by
    /// the regex match.
    pub end_offset: usize,
    /// Redacted rendering of the matched text, produced by `redact_match`.
    pub redacted: String,
    /// First 16 hex characters of the SHA-256 digest of the matched text.
    pub fingerprint: String,
}

/// A secret pattern spec paired with its compiled regular expression.
struct SecretRule {
    /// Static description of the rule; `detector`, `source`, `title` and
    /// `regex` are the fields this file reads from it.
    spec: &'static SecretPatternSpec,
    /// Regex compiled from `spec.regex`.
    regex: Regex,
}

/// Every entry of `DEFAULT_SECRET_PATTERN_SPECS`, compiled on first access.
///
/// # Panics
///
/// Initialization panics when a spec carries a pattern the regex engine
/// rejects; the message names the offending detector.
static SECRET_RULES: LazyLock<Vec<SecretRule>> = LazyLock::new(|| {
    let mut compiled_rules = Vec::new();
    for spec in DEFAULT_SECRET_PATTERN_SPECS.iter() {
        let regex = match Regex::new(spec.regex) {
            Ok(compiled) => compiled,
            Err(error) => panic!("invalid {} secret scan regex: {error}", spec.detector),
        };
        compiled_rules.push(SecretRule { spec, regex });
    }
    compiled_rules
});

/// Matches assignments of a quoted value to a credential-looking name.
///
/// The pattern is case-insensitive and multi-line. It looks for one of the
/// keywords (`secret`, `token`, `api_key`, `password`, ...), then at most 32
/// characters that are not a newline, `:` or `=`, then a `:` or `=`, optional
/// whitespace, and finally a single- or double-quoted run of at least 20
/// characters drawn from `A-Z a-z 0-9 + / = _ . -`. Capture group 1 is the
/// quoted value without its quotes. The opening and closing quote characters
/// are not required to be the same.
static HIGH_ENTROPY_ASSIGNMENT_RULE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?im)(?:secret|token|api[_-]?key|access[_-]?key|password|passwd|pwd|client[_-]?secret|private[_-]?key)[^\n:=]{0,32}(?::|=)\s*["']([A-Za-z0-9+/=_\.-]{20,})["']"#,
    )
    // The pattern is a compile-time literal, so a failure here is a bug in
    // this file rather than a runtime condition.
    .expect("high-entropy assignment regex literal is valid")
});

/// Scans `content` for secrets and returns the findings ordered by position.
///
/// Two passes feed the result:
/// 1. every rule in `SECRET_RULES` contributes one finding per regex match;
/// 2. `HIGH_ENTROPY_ASSIGNMENT_RULE` contributes the quoted value of each
///    credential-like assignment whose Shannon entropy is not below
///    `HIGH_ENTROPY_THRESHOLD`.
///
/// Findings are sorted by start offset, end offset and detector name. An
/// entropy finding is dropped when a specific rule reported exactly the same
/// span, and adjacent findings that agree on detector and span are collapsed.
pub fn scan_content(content: &str) -> Vec<SecretFinding> {
    const ENTROPY_DETECTOR: &str = "high-entropy-credential-assignment";

    let newline_index = line_starts(content);
    let mut findings: Vec<SecretFinding> = Vec::new();

    // Pass 1: the specific, named detectors.
    for rule in SECRET_RULES.iter() {
        findings.extend(rule.regex.find_iter(content).map(|hit| {
            build_finding(
                content,
                &newline_index,
                rule.spec.detector,
                rule.spec.source,
                rule.spec.title,
                hit.start(),
                hit.end(),
                hit.as_str(),
            )
        }));
    }

    // Pass 2: generic assignments; only capture group 1 (the quoted value)
    // is reported, and only when it looks random enough.
    let candidates = HIGH_ENTROPY_ASSIGNMENT_RULE
        .captures_iter(content)
        .filter_map(|captures| captures.get(1));
    for candidate in candidates {
        if shannon_entropy(candidate.as_str()) < HIGH_ENTROPY_THRESHOLD {
            continue;
        }
        findings.push(build_finding(
            content,
            &newline_index,
            ENTROPY_DETECTOR,
            "trufflehog",
            "High-entropy secret assignment",
            candidate.start(),
            candidate.end(),
            candidate.as_str(),
        ));
    }

    findings.sort_by(|a, b| {
        (a.start_offset, a.end_offset, &a.detector).cmp(&(
            b.start_offset,
            b.end_offset,
            &b.detector,
        ))
    });

    // Spans already claimed by a specific detector.
    let mut claimed_spans: BTreeSet<(usize, usize)> = BTreeSet::new();
    for finding in &findings {
        if finding.detector != ENTROPY_DETECTOR {
            claimed_spans.insert((finding.start_offset, finding.end_offset));
        }
    }

    // The generic entropy finding adds nothing when a specific rule already
    // covers the identical span.
    findings.retain(|finding| {
        !(finding.detector == ENTROPY_DETECTOR
            && claimed_spans.contains(&(finding.start_offset, finding.end_offset)))
    });

    // Sorting placed exact duplicates next to each other.
    findings.dedup_by(|later, earlier| {
        (later.start_offset, later.end_offset) == (earlier.start_offset, earlier.end_offset)
            && later.detector == earlier.detector
    });

    findings
}

/// Appends one audit event describing a finished scan to `event_log`.
///
/// The event is written to the `SECRET_SCAN_AUDIT_TOPIC` topic. Its kind is
/// `"scan_clean"` when `findings` is empty and `"scan_detected"` otherwise.
/// The payload carries the caller label, the scanned content length, the
/// finding count, a `clean` flag, one object per finding (location,
/// fingerprint and redacted text) and an `observed_at` timestamp.
///
/// # Errors
///
/// Propagates the error returned by the event log's `append`.
///
/// # Panics
///
/// Panics if `SECRET_SCAN_AUDIT_TOPIC` is rejected by `Topic::new`.
pub async fn append_secret_scan_audit<L: EventLog + ?Sized>(
    event_log: &L,
    caller: &str,
    content_len: usize,
    findings: &[SecretFinding],
) -> Result<(), crate::event_log::LogError> {
    let clean = findings.is_empty();

    let finding_records: Vec<serde_json::Value> = findings
        .iter()
        .map(|entry| {
            serde_json::json!({
                "detector": entry.detector,
                "source": entry.source,
                "title": entry.title,
                "line": entry.line,
                "column_start": entry.column_start,
                "column_end": entry.column_end,
                "start_offset": entry.start_offset,
                "end_offset": entry.end_offset,
                "fingerprint": entry.fingerprint,
                "redacted": entry.redacted,
            })
        })
        .collect();

    let payload = serde_json::json!({
        "caller": caller,
        "content_len": content_len,
        "finding_count": findings.len(),
        "clean": clean,
        "findings": finding_records,
        "observed_at": crate::orchestration::now_rfc3339(),
    });

    let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).expect("secret scan audit topic is valid");
    let kind = if clean { "scan_clean" } else { "scan_detected" };
    let event = LogEvent::new(kind, payload);

    event_log.append(&topic, event).await?;
    Ok(())
}

/// Records a finished scan in the structured log and, when one is active,
/// in the event log.
///
/// The structured log record is always emitted first. If `active_event_log`
/// returns a log, an audit event is appended to it as well; a failure to
/// append is reported as a warning and is not returned to the caller.
pub async fn audit_secret_scan_active(
    caller: &str,
    content_len: usize,
    findings: &[SecretFinding],
) {
    emit_secret_scan_log(caller, content_len, findings);

    if let Some(sink) = active_event_log() {
        let outcome = append_secret_scan_audit(sink.as_ref(), caller, content_len, findings).await;
        if let Err(error) = outcome {
            let message = format!("failed to append secret scan audit event: {error}");
            crate::events::log_warn("secret_scan.audit", &message);
        }
    }
}

pub(crate) fn register_secret_scan_builtins(vm: &mut Vm) {
    vm.register_async_builtin("secret_scan", |args| async move {
        let content = match args.first() {
            Some(VmValue::Nil) | None => {
                return Err(VmError::Runtime("secret_scan: content is required".into()));
            }
            Some(value) => value.display(),
        };

        let findings = scan_content(&content);
        audit_secret_scan_active("stdlib.secret_scan", content.len(), &findings).await;

        let value = serde_json::to_value(findings)
            .map_err(|error| VmError::Runtime(format!("secret_scan: {error}")))?;
        Ok(crate::schema::json_to_vm_value(&value))
    });
}

/// Assembles a `SecretFinding` for the text `matched`, found at
/// `start_offset..end_offset` of `content`.
///
/// `line_starts` is the line index produced by `line_starts(content)`. The
/// reported line is that of the start offset; the end column is measured on
/// whichever line contains the end offset. The matched text itself is only
/// stored in redacted and fingerprinted form.
fn build_finding(
    content: &str,
    line_starts: &[usize],
    detector: &str,
    source: &str,
    title: &str,
    start_offset: usize,
    end_offset: usize,
    matched: &str,
) -> SecretFinding {
    let start_position = offset_to_line_col(content, line_starts, start_offset);
    let end_position = offset_to_line_col(content, line_starts, end_offset);

    SecretFinding {
        detector: String::from(detector),
        source: String::from(source),
        title: String::from(title),
        line: start_position.0,
        column_start: start_position.1,
        column_end: end_position.1,
        start_offset,
        end_offset,
        redacted: redact_match(matched),
        fingerprint: fingerprint(matched),
    }
}

/// Returns the byte offset at which every line of `content` begins.
///
/// The first entry is always `0`; each `\n` adds the offset just past it, so
/// the result is strictly increasing and has one entry more than there are
/// newlines.
fn line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .match_indices('\n')
                .map(|(newline_at, _)| newline_at + 1),
        )
        .collect()
}

/// Converts a byte `offset` into a 1-based `(line, column)` pair.
///
/// `line_starts` must be the sorted line index of `content`. The column is
/// counted in characters from the start of the containing line, so
/// multi-byte characters advance it by one.
///
/// # Panics
///
/// Panics if `line_starts` is empty, or if `offset` is out of bounds or not
/// on a character boundary of `content`.
fn offset_to_line_col(content: &str, line_starts: &[usize], offset: usize) -> (usize, usize) {
    // Count the lines that begin at or before `offset`; the last of them is
    // the line that contains it.
    let lines_begun = line_starts.partition_point(|&start| start <= offset);
    let line_index = lines_begun.saturating_sub(1);

    let text_before = &content[line_starts[line_index]..offset];
    let column = text_before.chars().count() + 1;

    (line_index + 1, column)
}

/// Produces a rendering of `matched` that is safe to log.
///
/// * PEM-style blocks (text starting with `-----BEGIN `) are reduced to their
///   BEGIN header followed by `" …"`. The header is cut at its closing
///   dashes rather than at the first newline, so a block that was flattened
///   onto one line cannot leak key material. If no well-formed header is
///   found on the first line, a generic `-----BEGIN PRIVATE KEY-----` header
///   is used instead.
/// * Text of at most 8 characters is replaced by the same number of `*`.
/// * Anything longer keeps its first and last 4 characters, joined by `…` to
///   make the elision visible.
fn redact_match(matched: &str) -> String {
    const PEM_OPEN: &str = "-----BEGIN ";
    const PEM_DASHES: &str = "-----";
    const PEM_FALLBACK_HEADER: &str = "-----BEGIN PRIVATE KEY-----";

    if let Some(after_open) = matched.strip_prefix(PEM_OPEN) {
        let header = after_open
            .find(PEM_DASHES)
            // Both markers are ASCII, so the slice ends on a char boundary.
            .map(|label_len| &matched[..PEM_OPEN.len() + label_len + PEM_DASHES.len()])
            // Closing dashes found only on a later line mean the header is
            // malformed; the slice would then include body text.
            .filter(|header| !header.contains(|ch: char| ch == '\n' || ch == '\r'))
            .unwrap_or(PEM_FALLBACK_HEADER);
        return format!("{header} …");
    }

    let chars: Vec<char> = matched.chars().collect();
    if chars.len() <= 8 {
        // Too short to reveal any part of it.
        return "*".repeat(chars.len());
    }
    let prefix: String = chars[..4].iter().collect();
    let suffix: String = chars[chars.len() - 4..].iter().collect();
    format!("{prefix}…{suffix}")
}

/// Returns a short identifier for `matched`: the first 16 lowercase hex
/// characters (that is, the first 8 bytes) of its SHA-256 digest.
fn fingerprint(matched: &str) -> String {
    let digest = sha2::Sha256::digest(matched.as_bytes());
    // Two hex characters per byte, so 8 bytes give the 16-character prefix.
    digest
        .iter()
        .take(8)
        .map(|byte| format!("{byte:02x}"))
        .collect()
}

/// Computes the Shannon entropy of `value` in bits per character.
///
/// Each distinct character contributes `-p * log2(p)`, where `p` is its
/// relative frequency. An empty string has no characters to sum over and
/// yields zero.
fn shannon_entropy(value: &str) -> f64 {
    let mut total = 0usize;
    let mut histogram: BTreeMap<char, usize> = BTreeMap::new();
    for symbol in value.chars() {
        total += 1;
        *histogram.entry(symbol).or_default() += 1;
    }

    let total = total as f64;
    histogram
        .into_values()
        .map(|occurrences| {
            let probability = occurrences as f64 / total;
            -(probability * probability.log2())
        })
        .sum()
}

/// Emits an info-level structured log record summarizing a finished scan.
///
/// The metadata carries the audit topic name, the caller label, the scanned
/// content length, the finding count, a `clean` flag and, per finding, its
/// detector, source, line, fingerprint and redacted text.
fn emit_secret_scan_log(caller: &str, content_len: usize, findings: &[SecretFinding]) {
    let summaries: Vec<serde_json::Value> = findings
        .iter()
        .map(|entry| {
            serde_json::json!({
                "detector": entry.detector,
                "source": entry.source,
                "line": entry.line,
                "fingerprint": entry.fingerprint,
                "redacted": entry.redacted,
            })
        })
        .collect();

    let record = serde_json::json!({
        "topic": SECRET_SCAN_AUDIT_TOPIC,
        "caller": caller,
        "content_len": content_len,
        "finding_count": findings.len(),
        "clean": findings.is_empty(),
        "findings": summaries,
    });

    // The record above is built as a JSON object; any other shape would
    // simply produce empty metadata.
    let metadata = match record {
        serde_json::Value::Object(fields) => fields.into_iter().collect::<BTreeMap<_, _>>(),
        _ => BTreeMap::default(),
    };

    crate::events::log_info_meta("secret_scan.audit", "secret scan completed", metadata);
}

#[cfg(test)]
mod tests {
    use super::*;

    use crate::event_log::{EventLog, MemoryEventLog};

    /// A known token shape and a generic high-entropy assignment in the same
    /// input are both reported.
    #[test]
    fn scan_content_detects_specific_rules_and_entropy_rule() {
        let findings = scan_content(
            r#"
github_token = "ghp_1234567890abcdefghijklmnopqrstuvwxyzAB"
config = { client_secret: "QWxhZGRpbjpPcGVuU2VzYW1lQWNjZXNzVG9rZW4=" }
"#,
        );

        assert!(findings
            .iter()
            .any(|finding| finding.detector == "github-token"));
        assert!(findings
            .iter()
            .any(|finding| finding.detector == "high-entropy-credential-assignment"));
    }

    /// A PEM block is one finding spanning the whole block (without the
    /// trailing newline), redacted down to its header line.
    #[test]
    fn scan_content_redacts_private_key_blocks() {
        let findings = scan_content(
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----\n",
        );
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].detector, "private-key-block");
        assert_eq!(
            findings[0].end_offset - findings[0].start_offset,
            "-----BEGIN OPENSSH PRIVATE KEY-----\nZXhhbXBsZQ==\n-----END OPENSSH PRIVATE KEY-----"
                .len()
        );
        assert_eq!(
            findings[0].redacted,
            "-----BEGIN OPENSSH PRIVATE KEY----- …"
        );
    }

    /// Bearer and JWT token shapes are covered by the scanner.
    #[test]
    fn scan_content_covers_redaction_only_token_shapes() {
        let findings = scan_content(
            "Authorization: Bearer abcDEFghi123_-+/=xyz\njwt=eyJabcd.eyJefgh.signature_pad\n",
        );
        let detectors = findings
            .iter()
            .map(|finding| finding.detector.as_str())
            .collect::<BTreeSet<_>>();
        assert!(detectors.contains("bearer-token"));
        assert!(detectors.contains("jwt-token"));
    }

    /// The audit event carries the redacted rendering, never the raw secret.
    #[tokio::test(flavor = "current_thread")]
    async fn append_secret_scan_audit_writes_redacted_event() {
        let log = MemoryEventLog::new(32);
        let findings = scan_content(r#"token = "sk-abcdefghijklmnopqrstuvwx123456""#);
        append_secret_scan_audit(&log, "test.secret_scan", 44, &findings)
            .await
            .unwrap();

        let topic = Topic::new(SECRET_SCAN_AUDIT_TOPIC).unwrap();
        let events = log.read_range(&topic, None, 10).await.unwrap();
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].1.kind, "scan_detected");
        assert_eq!(events[0].1.payload["caller"], "test.secret_scan");
        let redacted = events[0].1.payload["findings"][0]["redacted"]
            .as_str()
            .unwrap();
        // Some rendering must be present, but not the secret's body.
        assert!(!redacted.is_empty());
        assert!(!redacted.contains("abcdefghijklmnopqrstuvwx123456"));
    }
}