Skip to main content

adler_core/
cache.rs

//! Cross-run result cache.
//!
//! Re-running a scan minutes apart should not re-hit every site. The cache
//! persists `Found` / `NotFound` verdicts keyed by `(site name, username)`
//! and guarded by:
//!
//! - a **TTL**: entries older than the configured age are ignored (and
//!   pruned on load), and
//! - a **site signature**: a deterministic hash of the site's URL template
//!   and signal list. If the site definition changes, its old cache entries
//!   no longer match and are treated as misses.
//!
//! `Uncertain` outcomes are intentionally never cached — they're transient
//! (rate limits, network blips) and caching them would freeze a temporary
//! failure for the whole TTL window.
//!
//! Access pattern is bulk: [`Cache::load`] once at scan start, in-memory
//! [`Cache::get`] / [`Cache::put`] during the scan, [`Cache::save`] once at
//! the end. There are no concurrent disk writes, so a plain JSON file with
//! an atomic temp-then-rename save is enough — no embedded database needed.
21
22use std::collections::HashMap;
23use std::path::{Path, PathBuf};
24use std::time::{Duration, SystemTime, UNIX_EPOCH};
25
26use serde::{Deserialize, Serialize};
27
28use crate::check::{CheckOutcome, MatchKind};
29use crate::error::Result;
30use crate::site::Site;
31use crate::username::Username;
32
/// On-disk format version. `Cache::load` discards any file whose `version`
/// field differs from this value.
const CACHE_VERSION: u32 = 1;
/// FNV-1a 64-bit offset basis: the initial hash state used by `signature`.
const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
/// FNV-1a 64-bit prime: the per-byte multiplier used by `signature`.
const FNV_PRIME: u64 = 0x100_0000_01b3;
36
37/// In-memory cache backed by a JSON file.
38#[derive(Debug)]
39pub struct Cache {
40    path: PathBuf,
41    ttl: Duration,
42    entries: HashMap<(String, String), Entry>,
43    dirty: bool,
44}
45
46#[derive(Debug, Clone)]
47struct Entry {
48    signature: u64,
49    stored_at: u64,
50    outcome: CheckOutcome,
51}
52
53#[derive(Serialize, Deserialize)]
54struct StoredEntry {
55    site: String,
56    username: String,
57    signature: u64,
58    stored_at: u64,
59    outcome: CheckOutcome,
60}
61
62#[derive(Serialize, Deserialize)]
63struct CacheFile {
64    version: u32,
65    entries: Vec<StoredEntry>,
66}
67
68impl Cache {
69    /// Load a cache from `path`, dropping entries older than `ttl`.
70    ///
71    /// Infallible: a missing, unreadable, or corrupt file yields an empty
72    /// cache (a warning is logged). The cache should never be the reason a
73    /// scan fails.
74    pub fn load(path: PathBuf, ttl: Duration) -> Self {
75        let mut cache = Self {
76            path,
77            ttl,
78            entries: HashMap::new(),
79            dirty: false,
80        };
81        let bytes = match std::fs::read(&cache.path) {
82            Ok(b) => b,
83            Err(err) if err.kind() == std::io::ErrorKind::NotFound => return cache,
84            Err(err) => {
85                tracing::warn!(error = %err, path = %cache.path.display(), "cache read failed");
86                return cache;
87            }
88        };
89        let parsed: CacheFile = match serde_json::from_slice(&bytes) {
90            Ok(f) => f,
91            Err(err) => {
92                tracing::warn!(error = %err, "cache file corrupt; starting empty");
93                return cache;
94            }
95        };
96        if parsed.version != CACHE_VERSION {
97            tracing::info!(
98                found = parsed.version,
99                expected = CACHE_VERSION,
100                "cache version mismatch; starting empty"
101            );
102            return cache;
103        }
104        let now = now_unix();
105        let ttl_secs = ttl.as_secs();
106        for stored in parsed.entries {
107            if now.saturating_sub(stored.stored_at) > ttl_secs {
108                cache.dirty = true; // expired entry pruned; persist the smaller file
109                continue;
110            }
111            cache.entries.insert(
112                (stored.site, stored.username),
113                Entry {
114                    signature: stored.signature,
115                    stored_at: stored.stored_at,
116                    outcome: stored.outcome,
117                },
118            );
119        }
120        cache
121    }
122
123    /// Look up a cached outcome for `site` + `username`.
124    ///
125    /// Returns `None` on a miss, a TTL expiry, or a site-signature mismatch
126    /// (the site definition changed since the entry was stored).
127    pub fn get(&self, site: &Site, username: &Username) -> Option<CheckOutcome> {
128        let key = (site.name.clone(), username.as_str().to_owned());
129        let entry = self.entries.get(&key)?;
130        if entry.signature != signature(site) {
131            return None;
132        }
133        if now_unix().saturating_sub(entry.stored_at) > self.ttl.as_secs() {
134            return None;
135        }
136        let mut outcome = entry.outcome.clone();
137        outcome.refresh_confidence();
138        Some(outcome)
139    }
140
141    /// Store an outcome. `Uncertain` outcomes are ignored (not cached).
142    pub fn put(&mut self, site: &Site, username: &Username, outcome: CheckOutcome) {
143        if matches!(outcome.kind, MatchKind::Uncertain) {
144            return;
145        }
146        let key = (site.name.clone(), username.as_str().to_owned());
147        self.entries.insert(
148            key,
149            Entry {
150                signature: signature(site),
151                stored_at: now_unix(),
152                outcome,
153            },
154        );
155        self.dirty = true;
156    }
157
158    /// Persist the cache to disk if anything changed since load. Writes
159    /// atomically (temp file + rename) and creates parent directories.
160    pub fn save(&self) -> Result<()> {
161        if !self.dirty {
162            return Ok(());
163        }
164        if let Some(parent) = self.path.parent() {
165            std::fs::create_dir_all(parent)?;
166        }
167        let mut entries: Vec<StoredEntry> = self
168            .entries
169            .iter()
170            .map(|((site, username), entry)| StoredEntry {
171                site: site.clone(),
172                username: username.clone(),
173                signature: entry.signature,
174                stored_at: entry.stored_at,
175                outcome: entry.outcome.clone(),
176            })
177            .collect();
178        entries.sort_by(|a, b| {
179            a.site
180                .cmp(&b.site)
181                .then_with(|| a.username.cmp(&b.username))
182        });
183        let file = CacheFile {
184            version: CACHE_VERSION,
185            entries,
186        };
187        let json = serde_json::to_string_pretty(&file)?;
188        let tmp = self.path.with_extension("json.tmp");
189        std::fs::write(&tmp, json)?;
190        std::fs::rename(&tmp, &self.path)?;
191        Ok(())
192    }
193
194    /// Number of live entries.
195    pub fn len(&self) -> usize {
196        self.entries.len()
197    }
198
199    /// True if the cache has no entries.
200    pub fn is_empty(&self) -> bool {
201        self.entries.is_empty()
202    }
203
204    /// Delete the cache file at `path`. A missing file is not an error.
205    pub fn clear(path: &Path) -> Result<()> {
206        match std::fs::remove_file(path) {
207            Ok(()) => Ok(()),
208            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(()),
209            Err(err) => Err(err.into()),
210        }
211    }
212
213    /// Default cache file location: `$XDG_CACHE_HOME/adler/cache.json`,
214    /// falling back to `$HOME/.cache/adler/cache.json`, then a relative
215    /// path if neither env var is set.
216    pub fn default_path() -> PathBuf {
217        if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
218            return PathBuf::from(xdg).join("adler").join("cache.json");
219        }
220        if let Some(home) = std::env::var_os("HOME") {
221            return PathBuf::from(home)
222                .join(".cache")
223                .join("adler")
224                .join("cache.json");
225        }
226        PathBuf::from("adler-cache.json")
227    }
228}
229
230/// Deterministic FNV-1a hash of a site's URL template and signal list.
231///
232/// Must be stable across processes, so we cannot use the std `DefaultHasher`
233/// (it's randomly seeded). FNV-1a over the serialized signals + URL is
234/// deterministic and collision-resistant enough for cache invalidation.
235fn signature(site: &Site) -> u64 {
236    let signals = serde_json::to_string(&site.signals).unwrap_or_default();
237    let mut hash = FNV_OFFSET;
238    for byte in site.url.as_str().bytes().chain(signals.bytes()) {
239        hash ^= u64::from(byte);
240        hash = hash.wrapping_mul(FNV_PRIME);
241    }
242    hash
243}
244
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Returns `0` if the system clock reports a time before the epoch.
fn now_unix() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
250
#[cfg(test)]
mod tests {
    use super::*;
    use crate::site::{HttpMethod, Signal, UrlTemplate};
    use std::collections::BTreeMap;

    /// TTL long enough that nothing stored during a test can expire.
    const HOUR: Duration = Duration::from_secs(3600);

    /// Minimal site definition named `name`: one status-200 "found" signal
    /// and every optional field left empty.
    fn site(name: &str) -> Site {
        Site {
            name: String::from(name),
            url: UrlTemplate::new("https://example.com/{username}").unwrap(),
            signals: vec![Signal::StatusFound { codes: vec![200] }],
            known_present: None,
            known_absent: None,
            extract: Vec::new(),
            tags: Vec::new(),
            request_headers: BTreeMap::new(),
            regex_check: None,
            engine: None,
            strip_bad_char: None,
            request_method: HttpMethod::Get,
            request_body: None,
            protection: Vec::new(),
            disabled: false,
            disabled_reason: None,
            source: None,
            popularity: None,
            access: crate::AccessPolicy::default(),
        }
    }

    /// Outcome of the given `kind` with otherwise fixed, uninteresting fields.
    fn outcome(kind: MatchKind) -> CheckOutcome {
        CheckOutcome {
            site: "Example".into(),
            url: "https://example.com/alice".into(),
            kind,
            reason: None,
            elapsed_ms: 5,
            enrichment: BTreeMap::new(),
            evidence: Vec::new(),
            profile_evidence: Vec::new(),
            confidence: crate::ConfidenceScore::default(),
            transport: None,
            escalations: 0,
        }
    }

    /// The username used by most tests.
    fn alice() -> Username {
        Username::new("alice").unwrap()
    }

    /// Per-test, per-process file path inside the system temp directory.
    fn tmp_path(tag: &str) -> PathBuf {
        std::env::temp_dir().join(format!(
            "adler-cache-test-{tag}-{}.json",
            std::process::id()
        ))
    }

    /// Cache that is never loaded from or saved to disk.
    fn empty_cache(ttl: Duration) -> Cache {
        Cache {
            path: tmp_path("mem"),
            ttl,
            entries: HashMap::new(),
            dirty: false,
        }
    }

    #[test]
    fn put_then_get_round_trips() {
        let example = site("Example");
        let user = alice();
        let mut cache = empty_cache(HOUR);
        cache.put(&example, &user, outcome(MatchKind::Found));
        let hit = cache.get(&example, &user).unwrap();
        assert_eq!(hit.kind, MatchKind::Found);
    }

    #[test]
    fn uncertain_is_not_cached() {
        let example = site("Example");
        let user = alice();
        let mut cache = empty_cache(HOUR);
        cache.put(&example, &user, outcome(MatchKind::Uncertain));
        assert!(cache.get(&example, &user).is_none());
        assert!(cache.is_empty());
    }

    #[test]
    fn get_misses_on_different_username() {
        let example = site("Example");
        let mut cache = empty_cache(HOUR);
        cache.put(&example, &alice(), outcome(MatchKind::Found));
        let bob = Username::new("bob").unwrap();
        assert!(cache.get(&example, &bob).is_none());
    }

    #[test]
    fn get_misses_when_signature_changes() {
        let original = site("Example");
        let user = alice();
        let mut cache = empty_cache(HOUR);
        cache.put(&original, &user, outcome(MatchKind::Found));

        // Same name, different signals → different signature → miss.
        let mut redefined = site("Example");
        redefined.signals = vec![Signal::StatusNotFound { codes: vec![404] }];
        assert!(cache.get(&redefined, &user).is_none());
    }

    #[test]
    fn get_misses_on_expired_entry() {
        let example = site("Example");
        let user = alice();
        let mut cache = empty_cache(Duration::from_secs(0));
        // `put` would stamp the entry with the current second, and the expiry
        // check is a strict `>`, so even a zero TTL would still hit within
        // that same second. Back-date the entry by hand instead.
        let stale = Entry {
            signature: signature(&example),
            stored_at: now_unix().saturating_sub(10),
            outcome: outcome(MatchKind::Found),
        };
        cache.entries.insert(("Example".into(), "alice".into()), stale);
        assert!(cache.get(&example, &user).is_none());
    }

    #[test]
    fn save_and_load_round_trip() {
        let path = tmp_path("roundtrip");
        let _ = std::fs::remove_file(&path);
        let example = site("Example");
        let user = alice();

        let mut writer = Cache::load(path.clone(), HOUR);
        writer.put(&example, &user, outcome(MatchKind::Found));
        writer.save().unwrap();
        drop(writer);

        let reader = Cache::load(path.clone(), HOUR);
        let hit = reader.get(&example, &user).unwrap();
        assert_eq!(hit.kind, MatchKind::Found);
        let _ = std::fs::remove_file(&path);
    }

    #[test]
    fn load_drops_expired_entries() {
        let path = tmp_path("expiry");
        // Hand-write a cache file whose only entry was stored two hours ago.
        let two_hours_old = StoredEntry {
            site: "Example".into(),
            username: "alice".into(),
            signature: signature(&site("Example")),
            stored_at: now_unix().saturating_sub(7200),
            outcome: outcome(MatchKind::Found),
        };
        let file = CacheFile {
            version: CACHE_VERSION,
            entries: vec![two_hours_old],
        };
        std::fs::write(&path, serde_json::to_string(&file).unwrap()).unwrap();
        // TTL of 1 hour → the 2-hour-old entry is pruned.
        let loaded = Cache::load(path.clone(), HOUR);
        assert!(loaded.is_empty());
        let _ = std::fs::remove_file(&path);
    }

    #[test]
    fn corrupt_file_yields_empty_cache() {
        let path = tmp_path("corrupt");
        std::fs::write(&path, b"this is not json {{{").unwrap();
        let loaded = Cache::load(path.clone(), HOUR);
        assert!(loaded.is_empty());
        let _ = std::fs::remove_file(&path);
    }

    #[test]
    fn clear_removes_file_and_tolerates_missing() {
        let path = tmp_path("clear");
        std::fs::write(&path, b"{}").unwrap();
        Cache::clear(&path).unwrap();
        assert!(!path.exists());
        // Second clear on a missing file is fine.
        Cache::clear(&path).unwrap();
    }

    #[test]
    fn signature_is_deterministic() {
        let example = site("Example");
        let first = signature(&example);
        let second = signature(&example);
        assert_eq!(first, second);
    }
}
439}