Skip to main content

vyre_std/pattern/
cache.rs

//! Content-addressed DFA compilation cache for `dfa_assemble`.
//!
//! The first compile of a pattern set walks regex_to_nfa → nfa_to_dfa →
//! dfa_minimize → dfa_pack. A second compile with the same patterns and
//! options skips the pipeline and reads the packed bytes back from the cache.
//!
//! Cache key: 64-bit FNV-1a over `CACHE_VERSION + options_bytes + pattern_bytes`,
//! rendered as 16 lowercase hex digits.
//! Default cache dir: `${XDG_CACHE_HOME:-~/.cache}/vyre/dfa/`.
//!
//! Disable by setting `VYRE_NO_CACHE` to any value (e.g. `VYRE_NO_CACHE=1`).
11
12use std::env;
13use std::fs;
14use std::io::Write;
15use std::path::{Path, PathBuf};
16
17use super::dfa_assemble::{AssembleOptions, Pattern};
18use super::types::{DfaPackFormat, PackedDfa, PatternError};
19
/// Version tag hashed into every cache key by `hash_key`; changing this string
/// changes every key, so entries written under an older tag are never looked up.
// NOTE(review): presumably bumped whenever the on-disk layout or the compile
// pipeline's output changes — confirm the policy with the maintainers.
const CACHE_VERSION: &str = "vyre-std.dfa.v2";
21
22/// Compute the content-addressed cache path for a pattern set + options.
23///
24/// Returns the absolute path the cache WOULD read or write for this input,
25/// without touching the filesystem. Use [`load_or_compute`] to actually
26/// hit the cache.
27#[must_use]
28#[inline]
29pub fn cache_path(patterns: &[Pattern<'_>], options: AssembleOptions) -> PathBuf {
30    let key = hash_key(patterns, options);
31    cache_dir().join(format!("{key}.vdfa"))
32}
33
34/// Load a cached [`PackedDfa`], or compute and persist one if missing.
35///
36/// The cache is bypassed entirely when `VYRE_NO_CACHE=1` is set.
37///
38/// # Errors
39///
40/// Returns [`PatternError`] from the underlying pipeline when compilation
41/// fails. Cache-layer IO errors are NOT propagated; a stale or unreadable
42/// cache entry causes a recompute and a warning to stderr.
43#[inline]
44pub fn load_or_compute(
45    patterns: &[Pattern<'_>],
46    options: AssembleOptions,
47) -> Result<PackedDfa, PatternError> {
48    if env::var_os("VYRE_NO_CACHE").is_some() {
49        return super::dfa_assemble::dfa_assemble(patterns, options);
50    }
51
52    let path = cache_path(patterns, options);
53    if let Ok(packed) = read_entry(&path) {
54        return Ok(packed);
55    }
56
57    let packed = super::dfa_assemble::dfa_assemble(patterns, options)?;
58    let _ = write_entry(&path, &packed);
59    Ok(packed)
60}
61
62/// Clear every `*.vdfa` entry in the cache directory.
63///
64/// # Errors
65///
66/// Returns the underlying IO error if the cache directory exists but
67/// cannot be traversed. Missing directories are treated as success.
68#[inline]
69pub fn clear() -> std::io::Result<()> {
70    let dir = cache_dir();
71    if !dir.exists() {
72        return Ok(());
73    }
74    for entry in fs::read_dir(&dir)? {
75        let entry = entry?;
76        if let Some(name) = entry.file_name().to_str() {
77            if name.ends_with(".vdfa") {
78                let _ = fs::remove_file(entry.path());
79            }
80        }
81    }
82    Ok(())
83}
84
85/// Total size of the cache in bytes (sum over all `*.vdfa` files).
86#[must_use]
87#[inline]
88pub fn size() -> u64 {
89    let dir = cache_dir();
90    if !dir.exists() {
91        return 0;
92    }
93    let Ok(reader) = fs::read_dir(&dir) else {
94        return 0;
95    };
96    reader
97        .filter_map(Result::ok)
98        .filter_map(|entry| {
99            let name = entry.file_name();
100            let name = name.to_string_lossy();
101            if !name.ends_with(".vdfa") {
102                return None;
103            }
104            entry.metadata().ok().map(|m| m.len())
105        })
106        .sum()
107}
108
/// Resolve the directory that holds cache entries.
///
/// Resolution order:
/// 1. `$XDG_CACHE_HOME/vyre/dfa`
/// 2. `$HOME/.cache/vyre/dfa`
/// 3. `.vyre-cache/dfa`, a relative path, when neither variable is usable.
///
/// A variable that is set but empty is treated as unset, matching the
/// `${XDG_CACHE_HOME:-~/.cache}` semantics in the module docs. Without this,
/// an empty value would silently yield a path relative to the working
/// directory.
fn cache_dir() -> PathBuf {
    let non_empty = |name: &str| env::var_os(name).filter(|value| !value.is_empty());

    if let Some(xdg) = non_empty("XDG_CACHE_HOME") {
        return PathBuf::from(xdg).join("vyre").join("dfa");
    }
    if let Some(home) = non_empty("HOME") {
        return PathBuf::from(home).join(".cache").join("vyre").join("dfa");
    }
    // No usable HOME: fall back to a relative path so tests still work in
    // sandboxed environments.
    PathBuf::from(".vyre-cache").join("dfa")
}
119
120fn hash_key(patterns: &[Pattern<'_>], options: AssembleOptions) -> String {
121    // FNV-1a over the serialized inputs. blake3 would be stronger but adds
122    // a dependency that vyre-std currently does not carry; FNV is sufficient
123    // because the cache trusts its own producer (no adversarial keys).
124    let mut hasher = Fnv1a::new();
125    hasher.update(CACHE_VERSION.as_bytes());
126    hasher.update(&[format_tag(options.format), options.minimize as u8]);
127    hasher.update(&(patterns.len() as u64).to_le_bytes());
128    for pattern in patterns {
129        match pattern {
130            Pattern::Literal(bytes) => {
131                hasher.update(b"lit");
132                hasher.update(&(bytes.len() as u64).to_le_bytes());
133                hasher.update(bytes);
134            }
135            Pattern::Regex(source) => {
136                hasher.update(b"rgx");
137                hasher.update(&(source.len() as u64).to_le_bytes());
138                hasher.update(source.as_bytes());
139            }
140        }
141    }
142    format!("{:016x}", hasher.finish())
143}
144
145fn format_tag(format: DfaPackFormat) -> u8 {
146    match format {
147        DfaPackFormat::Dense => 0,
148        DfaPackFormat::EquivClass => 1,
149    }
150}
151
/// Minimal streaming 64-bit FNV-1a hasher; the wrapped `u64` is the running state.
struct Fnv1a(u64);

impl Fnv1a {
    /// Initial state (the 64-bit FNV offset basis).
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    /// Per-byte multiplier (the 64-bit FNV prime).
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    /// Start a fresh hash.
    fn new() -> Self {
        Self(Self::OFFSET_BASIS)
    }

    /// Absorb `bytes`: for each byte, XOR it into the state, then multiply by
    /// the prime (wrapping). Splitting input across calls yields the same
    /// digest as a single call over the concatenation.
    fn update(&mut self, bytes: &[u8]) {
        self.0 = bytes.iter().fold(self.0, |state, &byte| {
            (state ^ u64::from(byte)).wrapping_mul(Self::PRIME)
        });
    }

    /// Current digest; the hasher remains usable afterwards.
    fn finish(&self) -> u64 {
        self.0
    }
}
170
171fn read_entry(path: &Path) -> std::io::Result<PackedDfa> {
172    let buf = fs::read(path)?;
173    // Layout: [format tag u8][start u32][state_count u32][payload_len u64][payload]
174    if buf.len() < 17 {
175        return Err(std::io::Error::new(
176            std::io::ErrorKind::InvalidData,
177            "Fix: truncated cache entry",
178        ));
179    }
180    let format = match buf[0] {
181        0 => DfaPackFormat::Dense,
182        1 => DfaPackFormat::EquivClass,
183        _ => {
184            return Err(std::io::Error::new(
185                std::io::ErrorKind::InvalidData,
186                "Fix: unknown format tag in cache entry",
187            ))
188        }
189    };
190    let start = u32::from_le_bytes(buf[1..5].try_into().unwrap());
191    let state_count = u32::from_le_bytes(buf[5..9].try_into().unwrap());
192    let payload_len_u64 = u64::from_le_bytes(buf[9..17].try_into().unwrap());
193    let payload_len = usize::try_from(payload_len_u64).map_err(|_| {
194        std::io::Error::new(
195            std::io::ErrorKind::InvalidData,
196            "Fix: cache entry payload_len exceeds addressable memory",
197        )
198    })?;
199    if buf.len() < 17 + payload_len {
200        return Err(std::io::Error::new(
201            std::io::ErrorKind::InvalidData,
202            "Fix: cache entry payload length mismatch",
203        ));
204    }
205    Ok(PackedDfa {
206        format,
207        state_count,
208        start,
209        bytes: buf[17..17 + payload_len].to_vec(),
210    })
211}
212
213fn write_entry(path: &Path, packed: &PackedDfa) -> std::io::Result<()> {
214    if let Some(parent) = path.parent() {
215        fs::create_dir_all(parent)?;
216    }
217    let mut file = fs::File::create(path)?;
218    file.write_all(&[format_tag(packed.format)])?;
219    file.write_all(&packed.start.to_le_bytes())?;
220    file.write_all(&packed.state_count.to_le_bytes())?;
221    file.write_all(&(packed.bytes.len() as u64).to_le_bytes())?;
222    file.write_all(&packed.bytes)?;
223    Ok(())
224}
225
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pattern::dfa_assemble::{AssembleOptions, Pattern};

    /// Build a scratch directory path under the system temp dir. The label
    /// and a nanosecond timestamp keep concurrent test runs from sharing it.
    fn unique_cache_dir(label: &str) -> PathBuf {
        let nanos = match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
            Ok(elapsed) => elapsed.as_nanos(),
            Err(_) => 0,
        };
        std::env::temp_dir().join(format!("vyre-cache-test-{label}-{nanos}"))
    }

    /// Hashing the same inputs twice must give the same key.
    #[test]
    fn hash_key_is_stable_across_runs() {
        let options = AssembleOptions::default();
        let patterns = [Pattern::Literal(b"hello"), Pattern::Regex("[0-9]+")];
        let first = hash_key(&patterns, options);
        let second = hash_key(&patterns, options);
        assert_eq!(first, second);
    }

    /// Different pattern bytes must land on different keys.
    #[test]
    fn hash_key_differs_for_different_patterns() {
        let options = AssembleOptions::default();
        let hello = hash_key(&[Pattern::Literal(b"hello")], options);
        let world = hash_key(&[Pattern::Literal(b"world")], options);
        assert_ne!(hello, world);
    }

    /// The pack format is part of the key, so it must change the key.
    #[test]
    fn hash_key_differs_for_different_options() {
        let patterns = [Pattern::Literal(b"hello")];
        let key_for = |format: DfaPackFormat| {
            hash_key(
                &patterns,
                AssembleOptions {
                    format,
                    minimize: true,
                },
            )
        };
        let dense = key_for(DfaPackFormat::Dense);
        let equiv = key_for(DfaPackFormat::EquivClass);
        assert_ne!(dense, equiv);
    }

    /// An entry written to disk must decode back to an equal `PackedDfa`.
    #[test]
    fn write_and_read_roundtrip() {
        let scratch = unique_cache_dir("roundtrip");
        fs::create_dir_all(&scratch).unwrap();
        let entry_path = scratch.join("sample.vdfa");

        let literal = [Pattern::Literal(b"hi")];
        let original =
            super::super::dfa_assemble::dfa_assemble(&literal, AssembleOptions::default())
                .unwrap();

        write_entry(&entry_path, &original).unwrap();
        let reloaded = read_entry(&entry_path).unwrap();
        assert_eq!(reloaded, original);

        let _ = fs::remove_dir_all(&scratch);
    }
}