vyre-std 0.1.0

Vyre standard library: GPU DFA assembly pipeline, Aho-Corasick construction, and compositional arithmetic helpers
Documentation
//! Content-addressed DFA compilation cache for `dfa_assemble`.
//!
//! First compile of a pattern set walks regex_to_nfa → nfa_to_dfa →
//! dfa_minimize → dfa_pack. The second compile with the same patterns
//! skips everything and reads the packed bytes back from the cache.
//!
//! Cache key: 64-bit FNV-1a over `cache_version + options_bytes + pattern_bytes`,
//! rendered as 16 lowercase hex digits and used as the `.vdfa` file stem.
//! Default cache dir: `${XDG_CACHE_HOME:-~/.cache}/vyre/dfa/`.
//!
//! Disable by setting the `VYRE_NO_CACHE` environment variable (any value
//! works, e.g. `VYRE_NO_CACHE=1`).

use std::env;
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};

use super::dfa_assemble::{AssembleOptions, Pattern};
use super::types::{DfaPackFormat, PackedDfa, PatternError};

/// Version tag hashed into every cache key by `hash_key`.
///
/// Because the tag is part of the hashed input, a different value produces
/// different keys, so entries written under another tag are never looked up.
// NOTE(review): presumably bumped whenever the on-disk entry layout or the
// pipeline output changes — confirm the policy with the maintainers.
const CACHE_VERSION: &str = "vyre-std.dfa.v2";

/// Resolve the on-disk location of the cache entry for `patterns` + `options`.
///
/// The file name is the hex key produced by `hash_key` followed by the
/// `.vdfa` extension, placed inside the directory chosen by `cache_dir`.
/// Nothing is read from or written to the filesystem here; use
/// [`load_or_compute`] to actually consult the cache.
#[must_use]
#[inline]
pub fn cache_path(patterns: &[Pattern<'_>], options: AssembleOptions) -> PathBuf {
    let file_name = format!("{}.vdfa", hash_key(patterns, options));
    let mut location = cache_dir();
    location.push(file_name);
    location
}

/// Load a cached [`PackedDfa`], or compute and persist one if missing.
///
/// The cache is bypassed entirely whenever the `VYRE_NO_CACHE` environment
/// variable is set — to any value, not only `1`. In that case the pipeline
/// always runs and nothing is read from or written to the cache directory.
///
/// # Errors
///
/// Returns [`PatternError`] from the underlying pipeline when compilation
/// fails. Cache-layer IO errors are NOT propagated: an unreadable or corrupt
/// entry causes a recompute, a failed write simply leaves the entry absent,
/// and both cases print a warning to stderr. A missing entry is an ordinary
/// cache miss and stays silent.
#[inline]
pub fn load_or_compute(
    patterns: &[Pattern<'_>],
    options: AssembleOptions,
) -> Result<PackedDfa, PatternError> {
    if env::var_os("VYRE_NO_CACHE").is_some() {
        return super::dfa_assemble::dfa_assemble(patterns, options);
    }

    let path = cache_path(patterns, options);
    match read_entry(&path) {
        Ok(packed) => return Ok(packed),
        // First compile for this key: nothing to warn about.
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
        // Corrupt, truncated, or unreadable entry: fall through to a recompute,
        // which will overwrite it, but tell the user why the cache was skipped.
        Err(err) => eprintln!(
            "vyre: ignoring unusable DFA cache entry {}: {err}",
            path.display()
        ),
    }

    let packed = super::dfa_assemble::dfa_assemble(patterns, options)?;
    // Persisting is best effort; the freshly computed result is still returned.
    if let Err(err) = write_entry(&path, &packed) {
        eprintln!(
            "vyre: could not write DFA cache entry {}: {err}",
            path.display()
        );
    }
    Ok(packed)
}

/// Clear every `*.vdfa` entry in the cache directory.
///
/// Entries are matched by file name suffix, using the same lossy name
/// conversion as [`size`], so both functions agree on what counts as a cache
/// entry. Removal of an individual file is best effort: a file that cannot be
/// deleted is skipped rather than aborting the sweep.
///
/// # Errors
///
/// Returns the underlying IO error if the cache directory cannot be opened
/// or traversed for any reason other than not existing. A missing directory
/// is treated as success, including when it disappears while this function
/// is running.
#[inline]
pub fn clear() -> std::io::Result<()> {
    // Open the directory directly instead of probing with `exists()` first:
    // the probe both races with concurrent removal and reports stat failures
    // (e.g. permission denied) as "missing".
    let entries = match fs::read_dir(cache_dir()) {
        Ok(entries) => entries,
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(()),
        Err(err) => return Err(err),
    };
    for entry in entries {
        let entry = entry?;
        if entry.file_name().to_string_lossy().ends_with(".vdfa") {
            // Deliberately ignored: another process may have removed it already.
            let _ = fs::remove_file(entry.path());
        }
    }
    Ok(())
}

/// Report how many bytes the cache occupies on disk.
///
/// Adds up the metadata length of every directory entry whose name ends in
/// `.vdfa`. Any failure along the way (directory missing or unreadable, an
/// entry that cannot be listed or stat'ed) contributes zero instead of being
/// reported.
#[must_use]
#[inline]
pub fn size() -> u64 {
    let dir = cache_dir();
    if !dir.exists() {
        return 0;
    }
    let listing = match fs::read_dir(&dir) {
        Ok(listing) => listing,
        Err(_) => return 0,
    };

    let mut total: u64 = 0;
    for item in listing {
        let Ok(entry) = item else {
            continue;
        };
        if !entry.file_name().to_string_lossy().ends_with(".vdfa") {
            continue;
        }
        if let Ok(meta) = entry.metadata() {
            total += meta.len();
        }
    }
    total
}

/// Pick the directory that holds the `*.vdfa` cache entries.
///
/// Resolution order:
/// 1. `$XDG_CACHE_HOME/vyre/dfa` when `XDG_CACHE_HOME` is set and non-empty;
/// 2. `$HOME/.cache/vyre/dfa` when `HOME` is set and non-empty;
/// 3. the relative path `.vyre-cache/dfa` otherwise.
///
/// An empty variable is treated exactly like an unset one. The XDG Base
/// Directory specification requires this for `XDG_CACHE_HOME`, and without it
/// an empty value would silently turn into the relative path `vyre/dfa`.
fn cache_dir() -> PathBuf {
    let non_empty = |name: &str| env::var_os(name).filter(|value| !value.is_empty());

    if let Some(xdg) = non_empty("XDG_CACHE_HOME") {
        return PathBuf::from(xdg).join("vyre").join("dfa");
    }
    if let Some(home) = non_empty("HOME") {
        return PathBuf::from(home).join(".cache").join("vyre").join("dfa");
    }
    // No usable HOME: fall back to a relative path so tests still work in
    // sandboxed environments.
    PathBuf::from(".vyre-cache").join("dfa")
}

fn hash_key(patterns: &[Pattern<'_>], options: AssembleOptions) -> String {
    // FNV-1a over the serialized inputs. blake3 would be stronger but adds
    // a dependency that vyre-std currently does not carry; FNV is sufficient
    // because the cache trusts its own producer (no adversarial keys).
    let mut hasher = Fnv1a::new();
    hasher.update(CACHE_VERSION.as_bytes());
    hasher.update(&[format_tag(options.format), options.minimize as u8]);
    hasher.update(&(patterns.len() as u64).to_le_bytes());
    for pattern in patterns {
        match pattern {
            Pattern::Literal(bytes) => {
                hasher.update(b"lit");
                hasher.update(&(bytes.len() as u64).to_le_bytes());
                hasher.update(bytes);
            }
            Pattern::Regex(source) => {
                hasher.update(b"rgx");
                hasher.update(&(source.len() as u64).to_le_bytes());
                hasher.update(source.as_bytes());
            }
        }
    }
    format!("{:016x}", hasher.finish())
}

/// Map a [`DfaPackFormat`] to its single-byte tag.
///
/// The tag is hashed into the cache key by `hash_key` and written as the
/// first byte of every cache entry by `write_entry`; `read_entry` decodes the
/// same two values.
fn format_tag(format: DfaPackFormat) -> u8 {
    const DENSE_TAG: u8 = 0;
    const EQUIV_CLASS_TAG: u8 = 1;

    match format {
        DfaPackFormat::Dense => DENSE_TAG,
        DfaPackFormat::EquivClass => EQUIV_CLASS_TAG,
    }
}

/// Minimal streaming 64-bit FNV-1a hasher.
///
/// The wrapped `u64` is the running hash state.
struct Fnv1a(u64);

impl Fnv1a {
    /// Initial state (the 64-bit FNV offset basis).
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    /// Multiplier applied after each byte is mixed in (the 64-bit FNV prime).
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    /// Create a hasher in its initial state.
    fn new() -> Self {
        Fnv1a(Self::OFFSET_BASIS)
    }

    /// Mix `bytes` into the state: XOR each byte in, then multiply by the
    /// prime with wrapping arithmetic.
    fn update(&mut self, bytes: &[u8]) {
        self.0 = bytes.iter().fold(self.0, |state, &byte| {
            (state ^ u64::from(byte)).wrapping_mul(Self::PRIME)
        });
    }

    /// Return the current hash value without consuming the hasher.
    fn finish(&self) -> u64 {
        self.0
    }
}

/// Read and validate one cache entry from `path`.
///
/// Entry layout (all integers little-endian):
/// `[format tag u8][start u32][state_count u32][payload_len u64][payload]`.
///
/// # Errors
///
/// Propagates the IO error from reading the file (e.g. `NotFound` for a cache
/// miss). Returns `InvalidData` when the file is shorter than the 17-byte
/// header, carries an unknown format tag, or when the number of bytes after
/// the header differs from the declared `payload_len`. The length check is
/// done in `u64` space against the bytes actually present, so a corrupt
/// length field can neither overflow an addition nor index out of bounds.
fn read_entry(path: &Path) -> std::io::Result<PackedDfa> {
    /// Size of the fixed header that precedes the payload.
    const HEADER_LEN: usize = 17;

    /// Build the `InvalidData` error used for every malformed-entry case.
    fn invalid(message: &'static str) -> std::io::Error {
        std::io::Error::new(std::io::ErrorKind::InvalidData, message)
    }

    let buf = fs::read(path)?;
    if buf.len() < HEADER_LEN {
        return Err(invalid("Fix: truncated cache entry"));
    }
    let (header, payload) = buf.split_at(HEADER_LEN);

    let format = match header[0] {
        0 => DfaPackFormat::Dense,
        1 => DfaPackFormat::EquivClass,
        _ => return Err(invalid("Fix: unknown format tag in cache entry")),
    };

    // `header` is exactly HEADER_LEN bytes, so these fixed ranges are in bounds.
    let mut word = [0u8; 4];
    word.copy_from_slice(&header[1..5]);
    let start = u32::from_le_bytes(word);
    word.copy_from_slice(&header[5..9]);
    let state_count = u32::from_le_bytes(word);
    let mut len_bytes = [0u8; 8];
    len_bytes.copy_from_slice(&header[9..17]);
    let declared_len = u64::from_le_bytes(len_bytes);

    // Entries are written with exactly `payload_len` trailing bytes, so both a
    // short and an over-long file indicate corruption.
    if payload.len() as u64 != declared_len {
        return Err(invalid("Fix: cache entry payload length mismatch"));
    }

    Ok(PackedDfa {
        format,
        state_count,
        start,
        bytes: payload.to_vec(),
    })
}

/// Serialize `packed` to `path`, creating parent directories as needed.
///
/// Writes, in order and little-endian: the one-byte format tag, `start`
/// (`u32`), `state_count` (`u32`), the payload length (`u64`), and the payload
/// bytes. An existing file at `path` is truncated and overwritten.
///
/// # Errors
///
/// Returns the first IO error from creating the directory, creating the
/// file, or writing any of the sections.
fn write_entry(path: &Path, packed: &PackedDfa) -> std::io::Result<()> {
    if let Some(dir) = path.parent() {
        fs::create_dir_all(dir)?;
    }
    let mut out = fs::File::create(path)?;

    let tag = [format_tag(packed.format)];
    let start = packed.start.to_le_bytes();
    let state_count = packed.state_count.to_le_bytes();
    let payload_len = (packed.bytes.len() as u64).to_le_bytes();
    let sections: [&[u8]; 5] = [&tag, &start, &state_count, &payload_len, &packed.bytes];

    for section in &sections {
        out.write_all(section)?;
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::pattern::dfa_assemble::{AssembleOptions, Pattern};

    /// Build a per-test scratch directory path under the system temp dir.
    ///
    /// The label, process id, and current time in nanoseconds keep concurrent
    /// tests and concurrent test processes from sharing a directory.
    fn unique_cache_dir(label: &str) -> PathBuf {
        let mut base = std::env::temp_dir();
        base.push(format!(
            "vyre-cache-test-{label}-{}-{}",
            std::process::id(),
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .map(|d| d.as_nanos())
                .unwrap_or(0)
        ));
        base
    }

    /// Write `contents` to a scratch file, run `read_entry` on it, clean up,
    /// and return only the error kind (if any) so no `Debug` bound is needed.
    fn read_kind_of(label: &str, contents: &[u8]) -> Option<std::io::ErrorKind> {
        let dir = unique_cache_dir(label);
        fs::create_dir_all(&dir).unwrap();
        let path = dir.join("entry.vdfa");
        fs::write(&path, contents).unwrap();
        let kind = read_entry(&path).err().map(|err| err.kind());
        let _ = fs::remove_dir_all(&dir);
        kind
    }

    #[test]
    fn hash_key_is_stable_across_runs() {
        let patterns = [Pattern::Literal(b"hello"), Pattern::Regex("[0-9]+")];
        let options = AssembleOptions::default();
        let a = hash_key(&patterns, options);
        let b = hash_key(&patterns, options);
        assert_eq!(a, b);
    }

    #[test]
    fn hash_key_differs_for_different_patterns() {
        let options = AssembleOptions::default();
        let a = hash_key(&[Pattern::Literal(b"hello")], options);
        let b = hash_key(&[Pattern::Literal(b"world")], options);
        assert_ne!(a, b);
    }

    #[test]
    fn hash_key_differs_for_different_options() {
        let patterns = [Pattern::Literal(b"hello")];
        let dense = hash_key(
            &patterns,
            AssembleOptions {
                format: DfaPackFormat::Dense,
                minimize: true,
            },
        );
        let equiv = hash_key(
            &patterns,
            AssembleOptions {
                format: DfaPackFormat::EquivClass,
                minimize: true,
            },
        );
        assert_ne!(dense, equiv);
    }

    #[test]
    fn write_and_read_roundtrip() {
        let dir = unique_cache_dir("roundtrip");
        fs::create_dir_all(&dir).unwrap();
        let path = dir.join("sample.vdfa");
        let packed = super::super::dfa_assemble::dfa_assemble(
            &[Pattern::Literal(b"hi")],
            AssembleOptions::default(),
        )
        .unwrap();
        let written = write_entry(&path, &packed);
        let reloaded = read_entry(&path);
        // Clean up before asserting so a failure does not leak the directory.
        let _ = fs::remove_dir_all(&dir);
        written.unwrap();
        assert_eq!(reloaded.unwrap(), packed);
    }

    #[test]
    fn read_entry_rejects_truncated_header() {
        // One byte short of the 17-byte header.
        let kind = read_kind_of("truncated-header", &[0u8; 16]);
        assert_eq!(kind, Some(std::io::ErrorKind::InvalidData));
    }

    #[test]
    fn read_entry_rejects_unknown_format_tag() {
        let mut entry = [0u8; 17];
        entry[0] = 0xff;
        let kind = read_kind_of("unknown-tag", &entry);
        assert_eq!(kind, Some(std::io::ErrorKind::InvalidData));
    }

    #[test]
    fn read_entry_rejects_short_payload() {
        // Header declares a 4-byte payload but only 2 bytes follow.
        let mut entry = vec![0u8; 17];
        entry[9..17].copy_from_slice(&4u64.to_le_bytes());
        entry.extend_from_slice(&[1, 2]);
        let kind = read_kind_of("short-payload", &entry);
        assert_eq!(kind, Some(std::io::ErrorKind::InvalidData));
    }
}