Skip to main content

php_lsp/
cache.rs

1//! Persistent on-disk cache for Phase K.
2//!
3//! The cache stores a serialized `FileIndex` per PHP file, keyed on
4//! `(uri, content)`.  On a warm start `scan_workspace` reads the cached index
5//! instead of parsing the file, shrinking cold-start I/O from O(parse) to
6//! O(read + bincode-decode) — roughly 10–50× faster per file.
7//!
8//! ## Layout
9//!
10//! ```text
11//! ~/.cache/php-lsp/<schema-version>/<workspace-hash>/<entry-hash>.bin
12//! ```
13//!
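//! - `<schema-version>` — the `php-lsp` crate version joined with the
//!   hardcoded `mir` version and the `FileIndex` schema tag (currently
//!   `<crate-version>-mir-0.7-fi-v1`); changing any part rotates the
//!   entire cache so old entries are never decoded against a newer schema.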
16//! - `<workspace-hash>` — blake3 of the canonicalized absolute path of the
17//!   first workspace root, truncated to 16 hex chars. Two separate projects
18//!   get isolated caches; two checkouts of the same project at the same
19//!   absolute path share one.
20//! - `<entry-hash>` — blake3 of the bytes `uri || 0x00 || content`, truncated
21//!   to 32 hex chars. Editing a file changes the content → new key → cache
22//!   miss; a different file at the same URI also gets a different key.
23//!
24//! ## Format
25//!
26//! `bincode` v2 (binary, fast, schema-stable via serde derives on
27//! `FileIndex` et al). Files are written atomically via a temp-file rename
28//! to avoid half-written entries on an interrupted shutdown.
29//!
30//! ## Invalidation
31//!
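//! Rotating the schema version invalidates everything; changing a file's
//! content invalidates only that file's entry. There is no LRU eviction or
//! orphan sweep yet: stale entries accumulate until `WorkspaceCache::new`
//! finds the workspace directory above `CACHE_SIZE_CAP` and clears it.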
35
36use std::io::{self, Write};
37use std::path::{Path, PathBuf};
38
39use serde::{Serialize, de::DeserializeOwned};
40
/// Opaque identifier for one cache entry.
///
/// Callers obtain a key from [`WorkspaceCache::key_for`] or
/// [`WorkspaceCache::key_for_stat`] and hand it back unchanged to
/// [`WorkspaceCache::read`] / [`WorkspaceCache::write`]. The wrapped string
/// is the truncated hex digest that becomes the entry's file stem on disk.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CacheKey(String);

impl CacheKey {
    /// File stem (without the `.bin` extension) used for this key.
    fn as_filename(&self) -> &str {
        self.0.as_str()
    }
}
51
/// A workspace-scoped view of the on-disk cache.
///
/// The handle is nothing more than the path of the directory holding the
/// workspace's entries, so cloning it or sharing it between threads behind
/// an `Arc` is cheap and needs no locking — there is no mutable state
/// inside.
#[derive(Debug, Clone)]
pub struct WorkspaceCache {
    /// Directory that holds this workspace's cache entry files.
    dir: PathBuf,
}
59
/// Upper bound, in bytes, on the total size of one workspace's cache
/// directory (512 MiB).
///
/// [`WorkspaceCache::new`] compares the directory against this limit at
/// startup and wipes it when the limit is exceeded. Wiping is deliberately
/// cruder than LRU eviction: the only cost is that the next workspace scan
/// behaves like a cold start. The figure leaves headroom over a very large
/// workspace (50 k files × ~10 KB average `FileIndex`) while staying small
/// next to any reasonable disk.
pub const CACHE_SIZE_CAP: u64 = 512 << 20;
67
68impl WorkspaceCache {
69    /// Create (or re-open) the cache directory for a workspace rooted at
70    /// `root`. Returns `None` when the system has no usable home/cache
71    /// directory — callers should treat that as "cache disabled" and
72    /// proceed without persistence.
73    ///
74    /// If the existing cache directory exceeds [`CACHE_SIZE_CAP`], it is
75    /// cleared before the handle is returned. That's a coarse knob —
76    /// K3 could refine to LRU-by-mtime — but crossing 512 MiB at
77    /// startup indicates the workspace has churned through many
78    /// content hashes and the rebuild cost is bounded to one full
79    /// re-scan.
80    pub fn new(root: &Path) -> Option<Self> {
81        let base = cache_base_dir()?;
82        let schema = schema_version();
83        let workspace = workspace_hash(root);
84        let dir = base.join("php-lsp").join(schema).join(workspace);
85        std::fs::create_dir_all(&dir).ok()?;
86        let cache = Self { dir };
87        if cache.size_bytes().unwrap_or(0) > CACHE_SIZE_CAP {
88            let _ = cache.clear();
89        }
90        Some(cache)
91    }
92
93    /// The filesystem path of this workspace's cache directory.
94    pub fn cache_dir(&self) -> &std::path::Path {
95        &self.dir
96    }
97
98    /// Total bytes consumed by `.bin` entries in this workspace's cache
99    /// directory. Cheap (one `read_dir` pass, no recursion into
100    /// subdirectories because the layout is flat).
101    pub fn size_bytes(&self) -> io::Result<u64> {
102        let mut total = 0u64;
103        let entries = match std::fs::read_dir(&self.dir) {
104            Ok(e) => e,
105            Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(0),
106            Err(e) => return Err(e),
107        };
108        for entry in entries.flatten() {
109            let meta = match entry.metadata() {
110                Ok(m) => m,
111                Err(_) => continue,
112            };
113            if meta.is_file() {
114                total = total.saturating_add(meta.len());
115            }
116        }
117        Ok(total)
118    }
119
120    /// Override the root directory directly. Intended for tests; the
121    /// directory is used verbatim (no schema / workspace subdirectories
122    /// are appended).
123    #[cfg(test)]
124    pub fn with_dir(dir: PathBuf) -> Self {
125        Self { dir }
126    }
127
128    /// Build a cache key from file content. Combines `uri` and `content`
129    /// so that two files with identical content but different URIs get
130    /// different keys. Used in tests and legacy code paths.
131    pub fn key_for(uri: &str, content: &str) -> CacheKey {
132        let mut hasher = blake3::Hasher::new();
133        hasher.update(uri.as_bytes());
134        hasher.update(&[0u8]);
135        hasher.update(content.as_bytes());
136        let full = hasher.finalize().to_hex();
137        CacheKey(full.as_str()[..32].to_string())
138    }
139
140    /// Build a cache key from file metadata instead of content.
141    ///
142    /// Hashing the full file content costs ~1 ms/file CPU on warm starts,
143    /// nearly cancelling the parse savings. An mtime+size key is O(constant)
144    /// per file and is invalidated automatically when the file changes.
145    ///
146    /// Tradeoffs vs content hash: `touch` without an edit invalidates the
147    /// entry (safe cache miss, re-parses once) and clock skew on network
148    /// mounts can in theory produce a stale hit. Both are acceptable for a
149    /// developer tool where a spurious miss is safe.
150    pub fn key_for_stat(uri: &str, mtime_secs: u64, size: u64) -> CacheKey {
151        let mut hasher = blake3::Hasher::new();
152        hasher.update(uri.as_bytes());
153        hasher.update(&[1u8]); // distinct domain from key_for
154        hasher.update(&mtime_secs.to_le_bytes());
155        hasher.update(&size.to_le_bytes());
156        let full = hasher.finalize().to_hex();
157        CacheKey(full.as_str()[..32].to_string())
158    }
159
160    /// Deserialize a previously-cached value. Returns `None` on any I/O
161    /// or decode failure — a corrupted entry should look identical to a
162    /// missing one so callers fall through to the recompute path.
163    pub fn read<T: DeserializeOwned>(&self, key: &CacheKey) -> Option<T> {
164        let path = self.path_for(key);
165        let bytes = std::fs::read(&path).ok()?;
166        let config = bincode::config::standard();
167        bincode::serde::decode_from_slice(&bytes, config)
168            .ok()
169            .map(|(v, _len)| v)
170    }
171
172    /// Atomically publish an entry to the cache. Writes to a sibling
173    /// temp file then renames, so readers never see a half-written
174    /// payload even if the process dies mid-write.
175    ///
176    /// No fsync: the cache is advisory-only — a crash that loses a write
177    /// just produces a cache miss on the next startup, which safely falls
178    /// back to re-parsing. Skipping sync_all() avoids 5–15 ms per file on
179    /// macOS, which on a 1,500-file project accounts for most of the cold
180    /// indexing time.
181    pub fn write<T: Serialize>(&self, key: &CacheKey, value: &T) -> io::Result<()> {
182        let path = self.path_for(key);
183        let tmp = path.with_extension("tmp");
184        let config = bincode::config::standard();
185        let bytes = bincode::serde::encode_to_vec(value, config)
186            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
187        {
188            let mut f = std::fs::File::create(&tmp)?;
189            f.write_all(&bytes)?;
190        }
191        std::fs::rename(&tmp, &path)?;
192        Ok(())
193    }
194
195    /// Drop every entry in this workspace's cache. Safe to call while
196    /// other threads are reading — individual `read` calls that race
197    /// with a `clear` will see `None` rather than garbage, and the next
198    /// `write` recreates the entry.
199    pub fn clear(&self) -> io::Result<()> {
200        if self.dir.exists() {
201            std::fs::remove_dir_all(&self.dir)?;
202            std::fs::create_dir_all(&self.dir)?;
203        }
204        Ok(())
205    }
206
207    fn path_for(&self, key: &CacheKey) -> PathBuf {
208        self.dir.join(format!("{}.bin", key.as_filename()))
209    }
210}
211
/// Platform cache directory: `$XDG_CACHE_HOME` or `$HOME/.cache` on Unix,
/// `%LOCALAPPDATA%` on Windows. Deliberately doesn't depend on the `dirs`
/// crate — keeps the footprint small and the behaviour predictable.
///
/// `XDG_CACHE_HOME` is consulted first on every platform, but only when it
/// holds an absolute path: the XDG Base Directory specification requires
/// relative values to be treated as invalid and ignored, and honouring one
/// would make the cache location depend on the server's working directory.
/// Empty variables are treated as unset.
///
/// Returns `None` when no usable variable is set.
fn cache_base_dir() -> Option<PathBuf> {
    // Read an environment variable, mapping "set but empty" to `None`.
    let non_empty = |name: &str| std::env::var_os(name).filter(|value| !value.is_empty());

    non_empty("XDG_CACHE_HOME")
        .map(PathBuf::from)
        .filter(|dir| dir.is_absolute())
        .or_else(|| {
            if cfg!(windows) {
                non_empty("LOCALAPPDATA").map(PathBuf::from)
            } else {
                non_empty("HOME").map(|home| PathBuf::from(home).join(".cache"))
            }
        })
}
234
235/// Bump this constant (and the matching literal in [`schema_version`]) when
236/// `FileIndex` or any type it contains gains, loses, or renames a field.
237/// Rotating it causes every cached entry to be treated as a miss on the next
238/// cold start, regardless of whether the crate version changed.
239pub const FILE_INDEX_SCHEMA: &str = "fi-v1";
240
241/// Schema marker: bumping `php-lsp` crate version, `mir-codebase` version,
242/// or [`FILE_INDEX_SCHEMA`] invalidates every cached entry. The hardcoded mir
243/// version is a trade-off: keeping it in source means we don't depend on
244/// `build.rs` introspection, at the cost of needing to remember to update it
245/// alongside `Cargo.toml`.
246///
247/// **Important**: the `"fi-v1"` literal here must stay in sync with
248/// `FILE_INDEX_SCHEMA`. `concat!` requires a literal — bump both together.
249fn schema_version() -> &'static str {
250    concat!(env!("CARGO_PKG_VERSION"), "-mir-0.7-fi-v1")
251}
252
253fn workspace_hash(root: &Path) -> String {
254    let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
255    let hex = blake3::hash(canonical.as_os_str().as_encoded_bytes()).to_hex();
256    hex.as_str()[..16].to_string()
257}
258
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    #[derive(Serialize, serde::Deserialize, PartialEq, Debug)]
    struct SamplePayload {
        name: String,
        values: Vec<u32>,
    }

    /// Shorthand constructor for the payload used throughout the tests.
    fn sample(name: &str, values: Vec<u32>) -> SamplePayload {
        SamplePayload {
            name: name.to_string(),
            values,
        }
    }

    /// A cache rooted in a fresh temp directory. The `TempDir` guard is
    /// returned too: dropping it deletes the directory, so callers must
    /// keep it alive for the duration of the test.
    fn temp_cache() -> (TempDir, WorkspaceCache) {
        let dir = TempDir::new().unwrap();
        let cache = WorkspaceCache::with_dir(dir.path().to_path_buf());
        (dir, cache)
    }

    #[test]
    fn key_for_is_deterministic_per_uri_and_content() {
        let k1 = WorkspaceCache::key_for("file:///a.php", "<?php echo 1;");
        let k2 = WorkspaceCache::key_for("file:///a.php", "<?php echo 1;");
        assert_eq!(k1, k2);
    }

    #[test]
    fn key_for_differs_when_content_differs() {
        let k1 = WorkspaceCache::key_for("file:///a.php", "<?php echo 1;");
        let k2 = WorkspaceCache::key_for("file:///a.php", "<?php echo 2;");
        assert_ne!(k1, k2);
    }

    #[test]
    fn key_for_differs_when_uri_differs() {
        // Same content, different URI must not share a key.
        let k1 = WorkspaceCache::key_for("file:///a.php", "<?php");
        let k2 = WorkspaceCache::key_for("file:///b.php", "<?php");
        assert_ne!(k1, k2);
    }

    #[test]
    fn key_for_stat_is_deterministic() {
        let k1 = WorkspaceCache::key_for_stat("file:///a.php", 1_700_000_000, 42);
        let k2 = WorkspaceCache::key_for_stat("file:///a.php", 1_700_000_000, 42);
        assert_eq!(k1, k2);
        assert_eq!(k1.as_filename().len(), 32);
    }

    #[test]
    fn key_for_stat_differs_when_any_input_differs() {
        let base = WorkspaceCache::key_for_stat("file:///a.php", 1_700_000_000, 42);
        assert_ne!(
            base,
            WorkspaceCache::key_for_stat("file:///b.php", 1_700_000_000, 42)
        );
        assert_ne!(
            base,
            WorkspaceCache::key_for_stat("file:///a.php", 1_700_000_001, 42)
        );
        assert_ne!(
            base,
            WorkspaceCache::key_for_stat("file:///a.php", 1_700_000_000, 43)
        );
    }

    #[test]
    fn write_then_read_round_trips() {
        let (_dir, cache) = temp_cache();
        let key = WorkspaceCache::key_for("file:///x.php", "<?php");
        let payload = sample("x", vec![1, 2, 3]);
        cache.write(&key, &payload).unwrap();
        let decoded: SamplePayload = cache.read(&key).unwrap();
        assert_eq!(decoded, payload);
    }

    #[test]
    fn read_returns_none_for_missing_key() {
        let (_dir, cache) = temp_cache();
        let missing = WorkspaceCache::key_for("file:///nope.php", "");
        let decoded: Option<SamplePayload> = cache.read(&missing);
        assert!(decoded.is_none());
    }

    #[test]
    fn read_returns_none_for_corrupted_entry() {
        let (_dir, cache) = temp_cache();
        let key = WorkspaceCache::key_for("file:///c.php", "<?php");
        // Write garbage bytes directly into the slot the cache would use.
        std::fs::write(cache.path_for(&key), b"not valid bincode").unwrap();
        let decoded: Option<SamplePayload> = cache.read(&key);
        assert!(
            decoded.is_none(),
            "corrupted entry must look missing, not panic"
        );
    }

    #[test]
    fn write_is_atomic_via_rename() {
        // If the write path didn't go through a temp file, a crash
        // mid-`write_all` could leave a half-written `.bin`. We can't
        // easily simulate a crash, but we can at least assert that no
        // temp file lingers on success, whatever the temp file is named.
        let (_dir, cache) = temp_cache();
        let key = WorkspaceCache::key_for("file:///atomic.php", "<?php");
        cache.write(&key, &sample("a", vec![])).unwrap();

        assert!(cache.path_for(&key).exists(), "entry should be published");
        let leftovers: Vec<PathBuf> = std::fs::read_dir(cache.cache_dir())
            .unwrap()
            .flatten()
            .map(|entry| entry.path())
            .filter(|path| path.extension().map_or(false, |ext| ext == "tmp"))
            .collect();
        assert!(
            leftovers.is_empty(),
            "tmp file should be removed by rename: {leftovers:?}"
        );
    }

    #[test]
    fn clear_drops_all_entries() {
        let (_dir, cache) = temp_cache();
        let keys: Vec<CacheKey> = (0..3)
            .map(|i| WorkspaceCache::key_for(&format!("file:///c{i}.php"), ""))
            .collect();
        for (i, key) in keys.iter().enumerate() {
            cache.write(key, &sample(&i.to_string(), vec![])).unwrap();
        }
        cache.clear().unwrap();
        for key in &keys {
            let decoded: Option<SamplePayload> = cache.read(key);
            assert!(decoded.is_none());
        }
    }

    #[test]
    fn missing_directory_is_treated_as_empty() {
        let dir = TempDir::new().unwrap();
        let cache = WorkspaceCache::with_dir(dir.path().join("never-created"));
        assert_eq!(cache.size_bytes().unwrap(), 0);
        cache.clear().unwrap();
    }

    #[test]
    fn size_bytes_sums_flat_bin_files() {
        let (_dir, cache) = temp_cache();
        assert_eq!(cache.size_bytes().unwrap(), 0);

        let key1 = WorkspaceCache::key_for("file:///s1.php", "<?php");
        cache.write(&key1, &sample("s1", vec![0u32; 16])).unwrap();
        let key2 = WorkspaceCache::key_for("file:///s2.php", "<?php");
        cache.write(&key2, &sample("s2", vec![0u32; 16])).unwrap();

        let total = cache.size_bytes().unwrap();
        let expected1 = cache.path_for(&key1).metadata().unwrap().len();
        let expected2 = cache.path_for(&key2).metadata().unwrap().len();
        assert_eq!(total, expected1 + expected2);
    }

    #[test]
    fn file_index_round_trips() {
        use crate::ast::ParsedDoc;
        use crate::file_index::FileIndex;

        let (_dir, cache) = temp_cache();
        let src = "<?php\nnamespace App;\nclass Foo { public function bar(): string {} }";
        let key = WorkspaceCache::key_for("file:///Foo.php", src);

        let doc = ParsedDoc::parse(src.to_string());
        let index = FileIndex::extract(&doc);
        cache.write(&key, &index).unwrap();

        let decoded: FileIndex = cache.read(&key).unwrap();
        assert_eq!(decoded.namespace.as_deref(), Some("App"));
        assert_eq!(decoded.classes.len(), 1);
        assert_eq!(decoded.classes[0].name.as_ref(), "Foo");
        assert_eq!(decoded.classes[0].methods.len(), 1);
        assert_eq!(decoded.classes[0].methods[0].name.as_ref(), "bar");
    }
}