//! envseal 0.3.14
//!
//! Write-only secret vault with process-level access control — post-agent secret management.
//! Confidential audit-log payloads.
//!
//! Pre-0.3.13 the audit log stored every event flattened in plaintext —
//! `binary`, `secret`, `signal_id`, etc. visible to anyone with read
//! access on `<vault_root>/audit.log`. The chain hash gave integrity
//! but not confidentiality, the same gap that motivated the
//! [`crate::vault::sealed_blob`] redesign for `policy.toml` /
//! `security_config.toml`.
//!
//! This module wraps every event payload with AES-256-GCM under a
//! per-vault audit key. The chain is computed over the *ciphertext*
//! line, so a forensic verifier can still walk the chain and detect
//! tampering without holding the key. Only an unlocker on the same
//! device (where `audit.key` is hardware-sealed via the DPAPI / Secure
//! Enclave / TPM 2.0 backend) can decrypt event bodies.
//!
//! # Domain separation
//!
//! Encryption uses [`crate::vault::sealed_blob::seal`] with domain
//! `b"audit_event.v1"`. The audit key is independent from the master
//! key (a cleartext-on-disk audit-key file does not let an attacker
//! decrypt secrets), so we don't need the vault to be unlocked to
//! append an event — important because audit logs are written from
//! signal handlers and pre-unlock paths.
//!
//! # Hardware backing
//!
//! The 32-byte audit key is wrapped by [`crate::vault::hardware`] just
//! like the master key. On `Backend::None` the wrap is a passthrough
//! (consistent with the master-key v1 fallback the user has already
//! opted into via `ENVSEAL_ACCEPT_NO_HARDWARE_SEAL`).

use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::{Mutex, OnceLock};

use rand::{rngs::OsRng, RngCore};
use zeroize::Zeroizing;

use crate::error::Error;
use crate::vault::hardware::{self, DeviceKeystore};
use crate::vault::sealed_blob;

/// HKDF / sealed-blob domain for individual audit-event payloads.
/// The `.v1` suffix is the version knob: changing the domain makes
/// previously written log lines fail to unseal, so bump the suffix
/// rather than editing the value in place.
const EVENT_DOMAIN: &[u8] = b"audit_event.v1";

/// Hardware-sealed audit key file relative to the vault root.
const AUDIT_KEY_FILENAME: &str = "audit.key";

/// 4-byte magic on the audit key file. Distinct from the master-key
/// magic so a confused operator who copies `master.key` into
/// `audit.key` (or vice versa) sees a precise refusal.
const AUDIT_KEY_MAGIC: [u8; 4] = *b"EAK1";

/// Hard cap on the audit-key file size. The legitimate file is
/// magic (4 B) + v2 header (9 B) + sealed payload (≤ a few hundred
/// bytes for any backend). 4 KiB leaves room for any future
/// envelope growth without admitting a denial-of-service via a
/// multi-gigabyte `audit.key` planted by a hostile filesystem.
/// Enforced both on the fstat size and via `Read::take` in
/// `load_key_from_disk`.
const AUDIT_KEY_MAX_BYTES: u64 = 4 * 1024;

/// Hard cap on a single hex-encoded sealed event passed back to
/// [`decrypt_event`]. 1 MiB is comfortably larger than any
/// legitimate audit payload (events are bounded enums with short
/// string fields) and prevents an attacker who can write garbage
/// into `audit.log` from forcing the reader to allocate
/// gigabytes during a forensic walk.
const SEALED_HEX_MAX_BYTES: usize = 1024 * 1024;

/// Lazily-initialized per-vault audit-key cache, keyed by canonical
/// vault root.
///
/// Each cached key lives inside a [`Zeroizing`] wrapper so that when
/// an entry is evicted (or the process exits) the 32 key bytes are
/// scrubbed on drop instead of lingering in allocator slop where a
/// `ReadProcessMemory`-style attacker could recover them. Because
/// `[u8; 32]` is `Copy`, the wrapper is what turns each slot into a
/// wipe-on-drop value rather than a bare copy that survives until the
/// `HashMap` reuses the slot.
fn cache() -> &'static Mutex<HashMap<PathBuf, Zeroizing<[u8; 32]>>> {
    static CACHE: OnceLock<Mutex<HashMap<PathBuf, Zeroizing<[u8; 32]>>>> = OnceLock::new();
    CACHE.get_or_init(Default::default)
}

/// Process-wide mutex serializing `audit.key` generation.
///
/// Without it, two threads hitting `load_or_generate_key` on the same
/// fresh vault root would both see `path.exists() == false`, both
/// generate a key, and the loser would die with `AlreadyExists` on the
/// exclusive `create_new(true)` of the `.tmp` file — silently dropping
/// its audit append. Holding this lock, the loser instead blocks,
/// re-checks, finds the file present, and loads the winner's key.
fn generation_lock() -> &'static Mutex<()> {
    static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
    LOCK.get_or_init(Default::default)
}

/// Canonical cache key for a vault root: resolve symlinks and
/// relative components when possible, otherwise fall back to the
/// path exactly as given (e.g. when the root does not exist yet).
fn cache_key(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(canonical) => canonical,
        Err(_) => root.to_path_buf(),
    }
}

/// Seal a canonical event-JSON payload under this vault's audit key.
///
/// The returned string is the lowercase-hex encoding of the
/// AES-256-GCM sealed blob (domain [`EVENT_DOMAIN`]), ready to embed
/// in a JSON log line.
///
/// # Errors
/// Propagates [`Error::CryptoFailure`] from the sealed-blob layer (KDF
/// or AEAD failure) or [`Error::AuditLogFailed`] if the audit key
/// cannot be loaded or generated.
pub fn encrypt_event(root: &Path, plaintext: &[u8]) -> Result<String, Error> {
    let audit_key = load_or_generate_key(root)?;
    sealed_blob::seal(plaintext, &audit_key, EVENT_DOMAIN)
        .map(|sealed| crate::hex::encode(sealed))
}

/// Inverse of [`encrypt_event`]. The hex string came from a log line
/// previously written by this same vault; on a different machine
/// (where the hardware seal of `audit.key` cannot be unwrapped) this
/// will return [`Error::AuditLogFailed`].
///
/// # Errors
/// `Error::AuditLogFailed` if the hex is malformed or the key cannot
/// be loaded; `Error::CryptoFailure` on AEAD failure.
pub fn decrypt_event(root: &Path, hex_ct: &str) -> Result<Zeroizing<Vec<u8>>, Error> {
    if hex_ct.len() > SEALED_HEX_MAX_BYTES {
        return Err(Error::AuditLogFailed(format!(
            "audit log: sealed event hex is {} bytes, exceeds {} cap",
            hex_ct.len(),
            SEALED_HEX_MAX_BYTES
        )));
    }
    let sealed = crate::hex::decode(hex_ct).ok_or_else(|| {
        Error::AuditLogFailed("audit log: malformed hex in sealed event".to_string())
    })?;
    let key = load_or_generate_key(root)?;
    // sealed_blob::unseal already wraps the AEAD output in
    // Zeroizing<Vec<u8>>; pass it through directly so the wrapper's
    // scrub-on-drop semantics are preserved end-to-end.
    sealed_blob::unseal(&sealed, &key, EVENT_DOMAIN)
}

/// Load the cached per-vault audit key, or load it from disk, or
/// generate-and-persist a fresh one if the file does not yet exist.
///
/// The returned key is held in a [`Zeroizing`] wrapper so a copy
/// taken out of the cache for one encryption is wiped on drop.
///
/// # Errors
/// Whatever [`load_key_from_disk`] or [`generate_and_persist_key`]
/// returns — both surface as [`Error::AuditLogFailed`].
fn load_or_generate_key(root: &Path) -> Result<Zeroizing<[u8; 32]>, Error> {
    let key_id = cache_key(root);
    // Fast path. The block expression scopes the MutexGuard so the
    // cache lock is released before any disk I/O below; the `**z`
    // copies the Copy array out, and the copy is immediately
    // re-wrapped in Zeroizing on return.
    if let Some(cached) = {
        let map = cache()
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        map.get(&key_id).map(|z| **z)
    } {
        return Ok(Zeroizing::new(cached));
    }

    let path = root.join(AUDIT_KEY_FILENAME);

    // Serialize generation across threads in this process. Then
    // re-check existence under the lock so the second thread to
    // arrive on a fresh vault loads the winner's key instead of
    // racing into `generate_and_persist_key` and failing on the
    // exclusive `.tmp` create.
    // (into_inner on poison: a panicked generator can't corrupt the
    // guarded state — the file system is the source of truth.)
    let _gen_guard = generation_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    let key = if path.exists() {
        load_key_from_disk(&path)?
    } else {
        match generate_and_persist_key(&path) {
            Ok(k) => k,
            Err(_) if path.exists() => {
                // Cross-process race: another envseal instance won
                // the create-new on the tmp file and renamed it into
                // place between our exists() check and our write.
                // Load that file instead of bubbling AlreadyExists.
                load_key_from_disk(&path)?
            }
            Err(e) => return Err(e),
        }
    };

    // Populate the cache. The generation guard is still held here, so
    // inserts for the same fresh vault are serialized; a later thread
    // re-inserting derives the same bytes from the same file, making
    // an overwrite harmless.
    let mut map = cache()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    map.insert(key_id, Zeroizing::new(*key));
    Ok(key)
}

/// Read `audit.key` from disk, validate its envelope, and unwrap the
/// 32-byte audit key through the device keystore.
///
/// Expected layout: 4-byte [`AUDIT_KEY_MAGIC`] followed by a
/// [`hardware::pack_v2`] envelope holding the hardware-sealed key.
///
/// # Errors
/// [`Error::AuditLogFailed`] on an oversized or corrupt file, a magic
/// or backend mismatch, or a keystore unseal failure.
fn load_key_from_disk(path: &Path) -> Result<Zeroizing<[u8; 32]>, Error> {
    use std::io::Read;
    // TOCTOU-free: open via the no-traverse helper, then fstat the
    // resulting handle for size and read from it. Without this, an
    // attacker who plants a symlink between verify_not_symlink and
    // a follow-up `std::fs::read(path)` would have redirected the
    // load to attacker-controlled content.
    let file = crate::file::atomic_open::open_read_no_traverse(path)?;
    let len = file
        .metadata()
        .map_err(|e| Error::AuditLogFailed(format!("stat audit.key: {e}")))?
        .len();
    if len > AUDIT_KEY_MAX_BYTES {
        return Err(Error::AuditLogFailed(format!(
            "audit.key is {len} bytes, exceeds {AUDIT_KEY_MAX_BYTES} cap — refusing to load",
        )));
    }
    // `take` belt-and-braces the fstat cap against a file that grows
    // between stat and read.
    let mut buf = Vec::with_capacity(usize::try_from(len).unwrap_or(0));
    file.take(AUDIT_KEY_MAX_BYTES)
        .read_to_end(&mut buf)
        .map_err(|e| Error::AuditLogFailed(format!("read audit.key: {e}")))?;
    if buf.len() < 4 || buf[..4] != AUDIT_KEY_MAGIC {
        return Err(Error::AuditLogFailed(
            "audit.key: missing magic — file corrupted or replaced".to_string(),
        ));
    }
    let envelope = hardware::parse_v2(&buf[4..])
        .map_err(|e| Error::AuditLogFailed(format!("audit.key envelope parse failed: {e}")))?;
    let keystore = DeviceKeystore::select();
    if envelope.backend != keystore.backend() {
        // Fix: the previous message relied on a string-literal line
        // continuation (`\` + newline), which strips the newline AND
        // the next line's leading whitespace, fusing the backend name
        // straight into "refusing" ("…offers tpm2refusing to load").
        // Add an explicit separator so the refusal stays readable.
        return Err(Error::AuditLogFailed(format!(
            "audit.key was sealed by {} but this device offers {} — \
             refusing to load (probably copied between machines)",
            envelope.backend.name(),
            keystore.backend().name()
        )));
    }
    // Hold the unsealed bytes in Zeroizing so the heap allocation
    // is wiped on drop even on the error paths below — without this,
    // a wrong-length keystore output would leave 32+ bytes of
    // sensitive material in a leaked Vec until allocator reuse.
    let unwrapped = Zeroizing::new(
        keystore
            .unseal(envelope.sealed)
            .map_err(|e| Error::AuditLogFailed(format!("audit.key unseal failed: {e}")))?,
    );
    if unwrapped.len() != 32 {
        return Err(Error::AuditLogFailed(format!(
            "audit.key plaintext is {} bytes, expected 32",
            unwrapped.len()
        )));
    }
    let mut key = [0u8; 32];
    key.copy_from_slice(&unwrapped);
    Ok(Zeroizing::new(key))
}

/// Generate a fresh 32-byte audit key, hardware-seal it, and persist
/// it atomically at `path`; returns the plaintext key for immediate
/// use by the caller.
///
/// Hardening: the key is filled *inside* a [`Zeroizing`] wrapper.
/// `[u8; 32]` is `Copy`, so the previous version — which filled a bare
/// array and only wrapped it on return — left an unscrubbed stack copy
/// of the fresh key alive until frame reuse, contradicting the
/// zeroization discipline documented on the cache. Now every exit
/// path (including seal/write errors) wipes the bytes on drop.
///
/// # Errors
/// [`Error::AuditLogFailed`] if the hardware seal or the atomic file
/// write fails.
fn generate_and_persist_key(path: &Path) -> Result<Zeroizing<[u8; 32]>, Error> {
    let mut key = Zeroizing::new([0u8; 32]);
    OsRng.fill_bytes(&mut *key);

    let keystore = DeviceKeystore::select();
    let sealed = keystore
        .seal(&*key)
        .map_err(|e| Error::AuditLogFailed(format!("audit.key seal failed: {e}")))?;
    // File layout: AUDIT_KEY_MAGIC || v2 envelope (see load_key_from_disk).
    let envelope = hardware::pack_v2(keystore.backend(), &sealed);
    let mut out = Vec::with_capacity(4 + envelope.len());
    out.extend_from_slice(&AUDIT_KEY_MAGIC);
    out.extend_from_slice(&envelope);

    write_atomic_secret(path, &out)?;
    Ok(key)
}

#[cfg(unix)]
fn write_atomic_secret(path: &Path, bytes: &[u8]) -> Result<(), Error> {
    use std::io::Write;
    use std::os::unix::fs::OpenOptionsExt;
    let tmp = path.with_extension("key.tmp");
    let write_then_rename = || -> Result<(), Error> {
        let mut opts = std::fs::OpenOptions::new();
        opts.write(true)
            .create_new(true)
            .mode(0o600)
            .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC);
        let mut f = opts
            .open(&tmp)
            .map_err(|e| Error::AuditLogFailed(format!("create audit.key.tmp: {e}")))?;
        f.write_all(bytes)
            .map_err(|e| Error::AuditLogFailed(format!("write audit.key.tmp: {e}")))?;
        f.sync_all().ok();
        std::fs::rename(&tmp, path)
            .map_err(|e| Error::AuditLogFailed(format!("rename audit.key: {e}")))?;
        // fsync the parent dir so the rename's new dirent survives
        // a crash. On ext4/xfs/zfs the rename may otherwise be
        // visible only after the next implicit metadata flush,
        // which can leave a fresh `audit.key` without its dirent
        // if power is yanked between rename and flush.
        if let Some(parent) = path.parent() {
            if let Ok(d) = std::fs::File::open(parent) {
                let _ = d.sync_all();
            }
        }
        Ok(())
    };
    let result = write_then_rename();
    // If any step failed, make sure we don't leave a 0o600 tmp
    // sitting in the vault directory between runs — a forensic
    // operator who sees `audit.key.tmp` without a corresponding
    // `audit.key` should be able to assume the previous attempt
    // crashed mid-write rather than wonder whether it's a partial
    // re-key.
    if result.is_err() {
        let _ = std::fs::remove_file(&tmp);
    }
    result
}

/// Atomically write the sealed audit-key bytes to `path` (Windows).
///
/// Strategy: write an exclusive tmp file, lock its DACL down to the
/// owner *before* the rename (so there is no window where another
/// process can open it under the inherited, often permissive,
/// parent-directory ACL), then rename it into place.
///
/// Fix: the tmp file is now removed on EVERY failure path. The
/// previous version cleaned up only after a DACL failure, so a failed
/// `write_all` or `rename` left a default-ACL `audit.key.tmp` behind —
/// inconsistent with the Unix twin, which removes the tmp on any error.
///
/// # Errors
/// [`Error::AuditLogFailed`] on create/write/DACL/rename failure.
#[cfg(windows)]
fn write_atomic_secret(path: &Path, bytes: &[u8]) -> Result<(), Error> {
    use std::io::Write;
    let tmp = path.with_extension("key.tmp");
    let staged = (|| -> Result<(), Error> {
        {
            let mut f = std::fs::OpenOptions::new()
                .write(true)
                .create_new(true)
                .open(&tmp)
                .map_err(|e| Error::AuditLogFailed(format!("create audit.key.tmp: {e}")))?;
            f.write_all(bytes)
                .map_err(|e| Error::AuditLogFailed(format!("write audit.key.tmp: {e}")))?;
            // Best-effort flush; the rename below is the commit point.
            f.sync_all().ok();
        }
        // Lock the tmp file down before rename. A rare DACL failure on
        // a non-NTFS volume is reported (not swallowed) because the
        // alternative is a secret file readable under default ACLs.
        crate::policy::windows_acl::set_owner_only_dacl(&tmp)
            .map_err(|e| Error::AuditLogFailed(format!("lock audit.key.tmp DACL: {e}")))?;
        std::fs::rename(&tmp, path)
            .map_err(|e| Error::AuditLogFailed(format!("rename audit.key: {e}")))
    })();
    if staged.is_err() {
        // Unified cleanup: never leave audit.key.tmp behind on failure.
        let _ = std::fs::remove_file(&tmp);
        return staged;
    }
    // Re-apply on the destination — rename preserves the source ACL on
    // NTFS, but be defensive in case anything in the chain (e.g. a
    // filter driver) reset it.
    let _ = crate::policy::windows_acl::set_owner_only_dacl(path);
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn roundtrips_event_payload() {
        let vault = tempfile::tempdir().unwrap();
        let event_json = b"{\"event\":\"x\",\"binary\":\"/usr/bin/cat\"}";
        let sealed_hex = encrypt_event(vault.path(), event_json).unwrap();
        let recovered = decrypt_event(vault.path(), &sealed_hex).unwrap();
        assert_eq!(recovered.as_slice(), event_json);
    }

    #[test]
    fn ciphertext_does_not_contain_plaintext_substrings() {
        let vault = tempfile::tempdir().unwrap();
        let event_json = b"{\"binary\":\"/usr/bin/recognizable-target\"}";
        let sealed_hex = encrypt_event(vault.path(), event_json).unwrap();
        assert!(!sealed_hex.contains("recognizable-target"));
    }

    #[test]
    fn second_encrypt_uses_persisted_key() {
        let vault = tempfile::tempdir().unwrap();
        let payload = b"first";
        encrypt_event(vault.path(), payload).unwrap();
        // Evict the in-process key cache: the next encrypt is forced
        // to reload from disk, so a successful decrypt proves the
        // on-disk key round-trips.
        let mut map = cache()
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        map.clear();
        drop(map);
        let sealed_hex = encrypt_event(vault.path(), payload).unwrap();
        let recovered = decrypt_event(vault.path(), &sealed_hex).unwrap();
        assert_eq!(recovered.as_slice(), payload);
    }

    #[test]
    fn malformed_hex_rejected() {
        let vault = tempfile::tempdir().unwrap();
        // Make the key exist first, so the failure below is
        // attributable to the bad hex rather than key generation.
        encrypt_event(vault.path(), b"x").unwrap();
        assert!(decrypt_event(vault.path(), "not-hex-zz").is_err());
    }
}