Skip to main content

graphify_cache/
lib.rs

1//! SHA256-based semantic caching for graphify.
2//!
3//! Caches extraction results keyed by content hash so unchanged files are not
4//! re-processed.
5
6use std::fs;
7use std::path::Path;
8
9use serde::{Serialize, de::DeserializeOwned};
10use sha2::{Digest, Sha256};
11use thiserror::Error;
12use tracing::debug;
13
/// Cache location used by the convenience wrappers that take no explicit
/// directory.
///
/// The path is relative, so it is interpreted against the process's current
/// working directory.
const CACHE_DIR: &str = "graphify-out/cache";
16
17/// Errors from the cache layer.
18#[derive(Debug, Error)]
19pub enum CacheError {
20    #[error("IO error: {0}")]
21    Io(#[from] std::io::Error),
22
23    #[error("serialization error: {0}")]
24    Serde(#[from] serde_json::Error),
25}
26
27/// Compute the SHA256 hex digest of a file's content.
28///
29/// Returns `None` if the file cannot be read.
30pub fn file_hash(path: &Path) -> Option<String> {
31    let content = fs::read(path).ok()?;
32    let hash = Sha256::digest(&content);
33    Some(format!("{:x}", hash))
34}
35
36/// Build a cache filename from a file path relative to `root`.
37///
38/// The key is `{sha256}.json` where the hash is computed over the file content,
39/// so any change in content naturally invalidates the cache entry.
40fn cache_key(path: &Path, _root: &Path) -> String {
41    let hash = file_hash(path).unwrap_or_default();
42    format!("{hash}.json")
43}
44
45/// Load a cached extraction result for `path`, returning `None` on cache miss.
46///
47/// A cache miss occurs when:
48/// - The source file cannot be read (hash fails).
49/// - No cache entry exists for the current content hash.
50/// - The cached JSON cannot be deserialized into `T`.
51pub fn load_cached<T: DeserializeOwned>(path: &Path, root: &Path) -> Option<T> {
52    load_cached_from(path, root, Path::new(CACHE_DIR))
53}
54
55/// Like [`load_cached`] but with an explicit cache directory.
56pub fn load_cached_from<T: DeserializeOwned>(
57    path: &Path,
58    root: &Path,
59    cache_dir: &Path,
60) -> Option<T> {
61    let key = cache_key(path, root);
62    let cache_path = cache_dir.join(&key);
63    if !cache_path.exists() {
64        debug!(?cache_path, "cache miss");
65        return None;
66    }
67    let data = fs::read_to_string(&cache_path).ok()?;
68    serde_json::from_str(&data).ok()
69}
70
71/// Save an extraction result to cache.
72///
73/// Returns `true` on success, `false` on any I/O or serialization failure.
74pub fn save_cached<T: Serialize>(path: &Path, result: &T, root: &Path) -> bool {
75    save_cached_to(path, result, root, Path::new(CACHE_DIR))
76}
77
78/// Like [`save_cached`] but with an explicit cache directory.
79pub fn save_cached_to<T: Serialize>(
80    path: &Path,
81    result: &T,
82    root: &Path,
83    cache_dir: &Path,
84) -> bool {
85    let key = cache_key(path, root);
86    let cache_path = cache_dir.join(&key);
87
88    // Ensure the cache directory exists.
89    if let Some(parent) = cache_path.parent()
90        && fs::create_dir_all(parent).is_err()
91    {
92        return false;
93    }
94
95    // Atomic write: serialise → write to .tmp → rename into place.
96    let tmp = cache_path.with_extension("tmp");
97    match serde_json::to_string(result) {
98        Ok(json) => {
99            if fs::write(&tmp, &json).is_ok() {
100                debug!(?cache_path, "cache write");
101                fs::rename(&tmp, &cache_path).is_ok()
102            } else {
103                false
104            }
105        }
106        Err(_) => false,
107    }
108}
109
110/// Remove all cached files from the default cache directory.
111pub fn clear_cache() -> std::io::Result<()> {
112    clear_cache_dir(Path::new(CACHE_DIR))
113}
114
/// Remove the given cache directory together with all files inside it.
///
/// A directory that does not exist is not an error: the cache is already
/// empty, so `Ok(())` is returned.
///
/// # Errors
///
/// Returns any other I/O error reported while removing the directory tree.
pub fn clear_cache_dir(cache_dir: &Path) -> std::io::Result<()> {
    // Remove directly rather than checking `exists()` first: the check-then-act
    // form can fail if the directory disappears in between, and `exists()`
    // hides errors by reporting them as "absent".
    match fs::remove_dir_all(cache_dir) {
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(()),
        outcome => outcome,
    }
}
122
123/// Invalidate the cache entry for a specific file.
124///
125/// Since caching is content-hash based, changing the file already causes a
126/// cache miss on the next read. This function pre-deletes entries matching
127/// the *current* content hash so stale data is cleaned up eagerly. It is a
128/// no-op when the file can't be read (already deleted, etc.).
129pub fn invalidate_cached(path: &Path, root: &Path, cache_dir: &Path) -> bool {
130    let key = cache_key(path, root);
131    let cache_path = cache_dir.join(&key);
132    if cache_path.exists() {
133        debug!(?cache_path, "cache invalidate");
134        fs::remove_file(&cache_path).is_ok()
135    } else {
136        true
137    }
138}
139
#[cfg(test)]
mod tests {
    use super::*;
    use serde::{Deserialize, Serialize};
    use std::fs;
    use std::path::PathBuf;
    use tempfile::TempDir;

    /// Minimal serializable payload standing in for a real extraction result.
    #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
    struct DummyResult {
        entities: Vec<String>,
        score: f64,
    }

    /// Build a fixed payload so round-trip comparisons are deterministic.
    fn make_dummy() -> DummyResult {
        DummyResult {
            entities: vec!["Alice".into(), "Bob".into()],
            score: 0.95,
        }
    }

    /// Create a temp workspace containing one source file.
    ///
    /// Returns the `TempDir` guard (its path doubles as `root`), the source
    /// file path, and a not-yet-created cache directory inside the workspace.
    fn workspace(name: &str, content: &str) -> (TempDir, PathBuf, PathBuf) {
        let dir = TempDir::new().unwrap();
        let src = dir.path().join(name);
        fs::write(&src, content).unwrap();
        let cache_dir = dir.path().join("cache");
        (dir, src, cache_dir)
    }

    #[test]
    fn test_file_hash_consistent() {
        let (_dir, file, _cache_dir) = workspace("hello.txt", "hello world");

        let h1 = file_hash(&file).unwrap();
        let h2 = file_hash(&file).unwrap();
        assert_eq!(h1, h2, "hash must be deterministic");

        // SHA256 of "hello world" is well-known.
        assert_eq!(
            h1,
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
    }

    #[test]
    fn test_file_hash_nonexistent() {
        assert!(file_hash(Path::new("/no/such/file")).is_none());
    }

    #[test]
    fn test_save_load_roundtrip() {
        let (dir, src, cache_dir) = workspace("src.rs", "fn main() {}");
        let root = dir.path();

        let value = make_dummy();
        assert!(save_cached_to(&src, &value, root, &cache_dir));

        let loaded: Option<DummyResult> = load_cached_from(&src, root, &cache_dir);
        assert_eq!(loaded, Some(value));
    }

    #[test]
    fn test_cache_miss_returns_none() {
        let (dir, src, cache_dir) = workspace("not_cached.rs", "let x = 1;");
        let root = dir.path();

        let loaded: Option<DummyResult> = load_cached_from(&src, root, &cache_dir);
        assert!(loaded.is_none());
    }

    #[test]
    fn test_content_change_invalidates_cache() {
        let (dir, src, cache_dir) = workspace("mutable.rs", "version 1");
        let root = dir.path();

        assert!(save_cached_to(&src, &make_dummy(), root, &cache_dir));

        // Mutate the source file — hash changes, old cache entry is stale.
        fs::write(&src, "version 2").unwrap();

        let loaded: Option<DummyResult> = load_cached_from(&src, root, &cache_dir);
        assert!(loaded.is_none(), "modified file must not match old cache");
    }

    #[test]
    fn test_clear_cache_removes_files() {
        let (dir, src, cache_dir) = workspace("f.txt", "data");
        let root = dir.path();

        assert!(save_cached_to(&src, &make_dummy(), root, &cache_dir));
        assert!(cache_dir.exists());

        clear_cache_dir(&cache_dir).unwrap();
        assert!(!cache_dir.exists());
    }

    #[test]
    fn test_clear_cache_dir_missing_is_ok() {
        let dir = TempDir::new().unwrap();
        assert!(clear_cache_dir(&dir.path().join("absent")).is_ok());
    }

    #[test]
    fn test_invalidate_cached_removes_entry() {
        let (dir, src, cache_dir) = workspace("inv.rs", "fn f() {}");
        let root = dir.path();

        assert!(save_cached_to(&src, &make_dummy(), root, &cache_dir));
        assert!(invalidate_cached(&src, root, &cache_dir));

        let loaded: Option<DummyResult> = load_cached_from(&src, root, &cache_dir);
        assert!(loaded.is_none(), "invalidated entry must not be served");

        // Nothing left to delete: still reported as success.
        assert!(invalidate_cached(&src, root, &cache_dir));
    }
}