Skip to main content

php_lsp/
cache.rs

1//! Persistent on-disk cache for Phase K.
2//!
3//! **Status: infrastructure layer only.** This module exposes the primitives
4//! — directory layout, content hashing, serde round-trip — that a later
5//! commit will wire into `scan_workspace` to skip re-parsing on warm start.
6//! Nothing in `backend.rs` / `document_store.rs` consumes it yet.
7//!
8//! ## Layout
9//!
10//! ```text
11//! ~/.cache/php-lsp/<schema-version>/<workspace-hash>/<entry-hash>.bin
12//! ```
13//!
14//! - `<schema-version>` — `php-lsp` crate version concatenated with the
15//!   `mir-codebase` version (the latter owns `StubSlice`'s schema, so
16//!   bumping either rotates the cache).
17//! - `<workspace-hash>` — blake3 of the canonicalized absolute path of the
18//!   first workspace root, truncated to 16 hex chars. Two separate projects
19//!   get isolated caches; two checkouts of the same project at the same
20//!   absolute path share one.
21//! - `<entry-hash>` — blake3 of the bytes `uri || 0x00 || content`, truncated
22//!   to 32 hex chars. Editing a file changes the content → new key → cache
23//!   miss; a different file at the same URI also gets a different key.
24//!
25//! ## Format
26//!
27//! `bincode` v2 (binary, fast, schema-stable via serde derives on
28//! `StubSlice` et al). Files are written atomically via a temp-file rename
29//! to avoid half-written entries on an interrupted shutdown.
30//!
31//! ## Invalidation
32//!
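//! Rotating the schema version invalidates everything; rotating the content
//! invalidates one file. There is no LRU or orphan sweep yet: the only
//! cleanup is the coarse size cap ([`CACHE_SIZE_CAP`]) checked in
//! [`WorkspaceCache::new`], which clears the whole workspace directory
//! when it is exceeded.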
36
37use std::io::{self, Write};
38use std::path::{Path, PathBuf};
39
40use serde::{Serialize, de::DeserializeOwned};
41
/// Opaque identifier of one cache entry.
///
/// Callers obtain a key from [`WorkspaceCache::key_for`] and hand it back
/// unchanged to the read/write methods; the wrapped string is never
/// exposed outside this module.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CacheKey(String);

impl CacheKey {
    /// The key as a file-name stem (the on-disk extension is added by the
    /// caller).
    fn as_filename(&self) -> &str {
        self.0.as_str()
    }
}
52
/// Handle to the cache directory for a single workspace.
///
/// The handle stores only the directory path and holds no mutable state,
/// so it can be cloned freely or shared across threads via `Arc`. Note
/// that [`WorkspaceCache::new`] does touch the filesystem: it creates the
/// directory and measures its current size.
#[derive(Debug, Clone)]
pub struct WorkspaceCache {
    /// Directory that directly contains this workspace's entry files.
    dir: PathBuf,
}
60
/// Size cap (bytes) for a single workspace's cache directory: 512 MiB.
///
/// [`WorkspaceCache::new`] compares the directory's current size against
/// this value and clears the directory when it is exceeded — simpler than
/// LRU eviction, and the rebuild cost is bounded (the next workspace scan
/// simply runs as if cold). 512 MiB fits a mega-workspace (50 k files ×
/// ~10 KB average `StubSlice`) with headroom and is small enough that no
/// reasonable disk will choke on it.
pub const CACHE_SIZE_CAP: u64 = 512 * 1024 * 1024;
68
69impl WorkspaceCache {
70    /// Create (or re-open) the cache directory for a workspace rooted at
71    /// `root`. Returns `None` when the system has no usable home/cache
72    /// directory — callers should treat that as "cache disabled" and
73    /// proceed without persistence.
74    ///
75    /// If the existing cache directory exceeds [`CACHE_SIZE_CAP`], it is
76    /// cleared before the handle is returned. That's a coarse knob —
77    /// K3 could refine to LRU-by-mtime — but crossing 512 MiB at
78    /// startup indicates the workspace has churned through many
79    /// content hashes and the rebuild cost is bounded to one full
80    /// re-scan.
81    pub fn new(root: &Path) -> Option<Self> {
82        let base = cache_base_dir()?;
83        let schema = schema_version();
84        let workspace = workspace_hash(root);
85        let dir = base.join("php-lsp").join(schema).join(workspace);
86        std::fs::create_dir_all(&dir).ok()?;
87        let cache = Self { dir };
88        if cache.size_bytes().unwrap_or(0) > CACHE_SIZE_CAP {
89            let _ = cache.clear();
90        }
91        Some(cache)
92    }
93
94    /// Total bytes consumed by `.bin` entries in this workspace's cache
95    /// directory. Cheap (one `read_dir` pass, no recursion into
96    /// subdirectories because the layout is flat).
97    pub fn size_bytes(&self) -> io::Result<u64> {
98        let mut total = 0u64;
99        let entries = match std::fs::read_dir(&self.dir) {
100            Ok(e) => e,
101            Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(0),
102            Err(e) => return Err(e),
103        };
104        for entry in entries.flatten() {
105            let meta = match entry.metadata() {
106                Ok(m) => m,
107                Err(_) => continue,
108            };
109            if meta.is_file() {
110                total = total.saturating_add(meta.len());
111            }
112        }
113        Ok(total)
114    }
115
116    /// Override the root directory directly. Intended for tests; the
117    /// directory is used verbatim (no schema / workspace subdirectories
118    /// are appended).
119    #[cfg(test)]
120    pub fn with_dir(dir: PathBuf) -> Self {
121        Self { dir }
122    }
123
124    /// Build a cache key for a single file. Combines `uri` and `content`
125    /// so that two files with identical content but different URIs get
126    /// different keys (StubSlice bakes `file` into its payload).
127    pub fn key_for(uri: &str, content: &str) -> CacheKey {
128        let mut hasher = blake3::Hasher::new();
129        hasher.update(uri.as_bytes());
130        hasher.update(&[0u8]);
131        hasher.update(content.as_bytes());
132        let full = hasher.finalize().to_hex();
133        // 32 hex chars = 128 bits, ample collision resistance for
134        // workspaces with millions of files (birthday bound ≫ 10^18).
135        CacheKey(full.as_str()[..32].to_string())
136    }
137
138    /// Deserialize a previously-cached value. Returns `None` on any I/O
139    /// or decode failure — a corrupted entry should look identical to a
140    /// missing one so callers fall through to the recompute path.
141    pub fn read<T: DeserializeOwned>(&self, key: &CacheKey) -> Option<T> {
142        let path = self.path_for(key);
143        let bytes = std::fs::read(&path).ok()?;
144        let config = bincode::config::standard();
145        bincode::serde::decode_from_slice(&bytes, config)
146            .ok()
147            .map(|(v, _len)| v)
148    }
149
150    /// Atomically publish an entry to the cache. Writes to a sibling
151    /// temp file then renames, so readers never see a half-written
152    /// payload even if the process dies mid-write.
153    pub fn write<T: Serialize>(&self, key: &CacheKey, value: &T) -> io::Result<()> {
154        let path = self.path_for(key);
155        let tmp = path.with_extension("tmp");
156        let config = bincode::config::standard();
157        let bytes = bincode::serde::encode_to_vec(value, config)
158            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
159        {
160            let mut f = std::fs::File::create(&tmp)?;
161            f.write_all(&bytes)?;
162            f.sync_all()?;
163        }
164        std::fs::rename(&tmp, &path)?;
165        Ok(())
166    }
167
168    /// Drop every entry in this workspace's cache. Safe to call while
169    /// other threads are reading — individual `read` calls that race
170    /// with a `clear` will see `None` rather than garbage, and the next
171    /// `write` recreates the entry.
172    pub fn clear(&self) -> io::Result<()> {
173        if self.dir.exists() {
174            std::fs::remove_dir_all(&self.dir)?;
175            std::fs::create_dir_all(&self.dir)?;
176        }
177        Ok(())
178    }
179
180    fn path_for(&self, key: &CacheKey) -> PathBuf {
181        self.dir.join(format!("{}.bin", key.as_filename()))
182    }
183}
184
/// Platform cache base directory.
///
/// Resolution order:
/// 1. `$XDG_CACHE_HOME`, on every platform, when set to an absolute path;
/// 2. `%LOCALAPPDATA%` on Windows, or `$HOME/.cache` elsewhere.
///
/// Unset, empty, and relative values are ignored. The XDG Base Directory
/// specification requires relative values to be treated as invalid, and
/// a relative base would make the cache location depend on whatever the
/// process's current directory happens to be.
///
/// Returns `None` when no usable variable is found. Deliberately doesn't
/// depend on the `dirs` crate — keeps the footprint small and the
/// behaviour predictable.
fn cache_base_dir() -> Option<PathBuf> {
    // An empty path is never absolute, so this also rejects "".
    let absolute_env = |name: &str| -> Option<PathBuf> {
        std::env::var_os(name)
            .map(PathBuf::from)
            .filter(|path| path.is_absolute())
    };

    if let Some(xdg) = absolute_env("XDG_CACHE_HOME") {
        return Some(xdg);
    }
    if cfg!(windows) {
        absolute_env("LOCALAPPDATA")
    } else {
        absolute_env("HOME").map(|home| home.join(".cache"))
    }
}
207
208/// Schema marker: bumping either `php-lsp` or `mir-codebase` invalidates
209/// every cached entry. The hardcoded mir version is a trade-off: keeping
210/// it in source means we don't depend on `build.rs` introspection, at the
211/// cost of needing to remember to update it alongside `Cargo.toml`. A
212/// compile-time assert in the serialize/deserialize path could catch
213/// drift — deferred to Step 2.
214fn schema_version() -> &'static str {
215    concat!(env!("CARGO_PKG_VERSION"), "-mir-0.7")
216}
217
218fn workspace_hash(root: &Path) -> String {
219    let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
220    let hex = blake3::hash(canonical.as_os_str().as_encoded_bytes()).to_hex();
221    hex.as_str()[..16].to_string()
222}
223
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Minimal serde payload used to exercise the round-trip paths.
    #[derive(Serialize, serde::Deserialize, PartialEq, Debug)]
    struct SamplePayload {
        name: String,
        values: Vec<u32>,
    }

    /// Fresh cache rooted in its own temporary directory. The `TempDir`
    /// guard is returned so the directory outlives the test body; bind
    /// it to a named variable (`_dir`), never to `_`, which would drop it
    /// immediately.
    fn temp_cache() -> (TempDir, WorkspaceCache) {
        let dir = TempDir::new().unwrap();
        let cache = WorkspaceCache::with_dir(dir.path().to_path_buf());
        (dir, cache)
    }

    /// Shorthand for building a `SamplePayload`.
    fn payload(name: &str, values: Vec<u32>) -> SamplePayload {
        SamplePayload {
            name: name.to_string(),
            values,
        }
    }

    #[test]
    fn key_for_is_deterministic_per_uri_and_content() {
        let k1 = WorkspaceCache::key_for("file:///a.php", "<?php echo 1;");
        let k2 = WorkspaceCache::key_for("file:///a.php", "<?php echo 1;");
        assert_eq!(k1, k2);
    }

    #[test]
    fn key_for_differs_when_content_differs() {
        let k1 = WorkspaceCache::key_for("file:///a.php", "<?php echo 1;");
        let k2 = WorkspaceCache::key_for("file:///a.php", "<?php echo 2;");
        assert_ne!(k1, k2);
    }

    #[test]
    fn key_for_differs_when_uri_differs() {
        // Same content under two URIs must not share a cache slot.
        let k1 = WorkspaceCache::key_for("file:///a.php", "<?php");
        let k2 = WorkspaceCache::key_for("file:///b.php", "<?php");
        assert_ne!(k1, k2);
    }

    #[test]
    fn key_for_separator_prevents_boundary_ambiguity() {
        // Without the 0x00 separator both inputs would hash the same
        // byte string "file:///a.php<?php".
        let k1 = WorkspaceCache::key_for("file:///a.php", "<?php");
        let k2 = WorkspaceCache::key_for("file:///a.", "php<?php");
        assert_ne!(k1, k2);
    }

    #[test]
    fn write_then_read_round_trips() {
        let (_dir, cache) = temp_cache();
        let key = WorkspaceCache::key_for("file:///x.php", "<?php");
        let original = payload("x", vec![1, 2, 3]);
        cache.write(&key, &original).unwrap();
        let decoded: SamplePayload = cache.read(&key).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn read_returns_none_for_missing_key() {
        let (_dir, cache) = temp_cache();
        let missing = WorkspaceCache::key_for("file:///nope.php", "");
        let decoded: Option<SamplePayload> = cache.read(&missing);
        assert!(decoded.is_none());
    }

    #[test]
    fn read_returns_none_for_corrupted_entry() {
        let (_dir, cache) = temp_cache();
        let key = WorkspaceCache::key_for("file:///c.php", "<?php");
        // Write garbage bytes directly into the slot the cache would use.
        std::fs::write(cache.path_for(&key), b"not valid bincode").unwrap();
        let decoded: Option<SamplePayload> = cache.read(&key);
        assert!(
            decoded.is_none(),
            "corrupted entry must look missing, not panic"
        );
    }

    #[test]
    fn write_is_atomic_via_rename() {
        // If the write path didn't go through a temp file, a crash
        // mid-`write_all` could leave a half-written `.bin`. We can't
        // easily simulate a crash, but we can assert that a successful
        // write leaves nothing behind except the published entry —
        // regardless of how the temp file is named.
        let (dir, cache) = temp_cache();
        let key = WorkspaceCache::key_for("file:///atomic.php", "<?php");
        cache.write(&key, &payload("a", vec![])).unwrap();

        let names: Vec<String> = std::fs::read_dir(dir.path())
            .unwrap()
            .map(|entry| entry.unwrap().file_name().to_string_lossy().into_owned())
            .collect();
        assert_eq!(
            names,
            vec![format!("{}.bin", key.as_filename())],
            "only the published entry should remain after a successful write"
        );
    }

    #[test]
    fn clear_drops_all_entries() {
        let (_dir, cache) = temp_cache();
        let keys: Vec<CacheKey> = (0..3)
            .map(|i| WorkspaceCache::key_for(&format!("file:///c{i}.php"), ""))
            .collect();
        for (i, key) in keys.iter().enumerate() {
            cache.write(key, &payload(&i.to_string(), vec![])).unwrap();
        }
        cache.clear().unwrap();
        for key in &keys {
            let decoded: Option<SamplePayload> = cache.read(key);
            assert!(decoded.is_none());
        }
    }

    #[test]
    fn cache_is_usable_after_clear() {
        // `clear` must leave the directory in a state where the next
        // `write` succeeds.
        let (_dir, cache) = temp_cache();
        let key = WorkspaceCache::key_for("file:///again.php", "<?php");
        cache.write(&key, &payload("before", vec![1])).unwrap();
        cache.clear().unwrap();
        assert_eq!(cache.size_bytes().unwrap(), 0);

        let after = payload("after", vec![2]);
        cache.write(&key, &after).unwrap();
        let decoded: SamplePayload = cache.read(&key).unwrap();
        assert_eq!(decoded, after);
    }

    #[test]
    fn size_bytes_sums_flat_bin_files() {
        let (_dir, cache) = temp_cache();
        assert_eq!(cache.size_bytes().unwrap(), 0);

        let key1 = WorkspaceCache::key_for("file:///s1.php", "<?php");
        cache.write(&key1, &payload("s1", vec![0u32; 16])).unwrap();
        let key2 = WorkspaceCache::key_for("file:///s2.php", "<?php");
        cache.write(&key2, &payload("s2", vec![0u32; 16])).unwrap();

        let total = cache.size_bytes().unwrap();
        let expected1 = cache.path_for(&key1).metadata().unwrap().len();
        let expected2 = cache.path_for(&key2).metadata().unwrap().len();
        assert_eq!(total, expected1 + expected2);
    }

    #[test]
    fn stub_slice_round_trips() {
        // Smoke-test the real payload shape Phase K Step 2 will cache:
        // mir_codebase::StubSlice already derives Serialize/Deserialize.
        let (_dir, cache) = temp_cache();
        let key = WorkspaceCache::key_for("file:///stub.php", "<?php class Foo {}");
        let slice = mir_codebase::storage::StubSlice::default();
        cache.write(&key, &slice).unwrap();
        let decoded: mir_codebase::storage::StubSlice = cache.read(&key).unwrap();
        // StubSlice has no PartialEq, so we compare a cheap proxy:
        // the class count (0 for a default).
        assert_eq!(decoded.classes.len(), slice.classes.len());
    }
}