Skip to main content

ai_memory/governance/
rule_cache.rs

1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3
4//! v0.7.0 #991 — per-instance cache for [`crate::governance::Rule`]
5//! lists keyed by [`crate::governance::AgentAction::kind`].
6//!
7//! ## Why this exists
8//!
9//! Pre-#991 every governance check (`check_agent_action*`) called
10//! [`crate::governance::RuleEngine::load_for_action`] which in turn
11//! called [`crate::governance::rules_store::list_enabled_by_kind`].
12//! That helper prepares a SQL statement, executes it, deserializes
13//! each row, AND runs Ed25519 signature verification on every signed
14//! row (the L1-6 bypass-impossibility invariant). The substrate
15//! issues a governance check on every memory write (store / link /
16//! delete / archive / forget) — so the helper fires once per write
17//! plus once per dispatched MCP / HTTP call that doesn't fast-path
18//! out, totalling 0.5-3 ms of avoidable work per write under the
19//! v0.7.0 typed-refusal envelope (#963) wire-up.
20//!
21//! ## Why this is per-instance, not process-wide (the #990 lesson)
22//!
23//! #983 shipped a process-wide singleton keyed on
24//! [`crate::governance::AgentAction::kind`] alone. The cache was
25//! reverted via #990 because multi-connection integration tests
26//! (`tests/governance_a2a_rules.rs::disabled_rule_at_peer_b_does_not_enforce_even_if_enabled_at_a`)
27//! exposed cross-connection key collisions: peer_b's load polluted
28//! peer_a's subsequent lookup. In production this was invisible
29//! (single daemon connection) but reverting was necessary to restore
30//! test correctness on the v0.7.0 ship-readiness gate.
31//!
32//! #991 takes the per-instance approach: the cache is *owned* by the
33//! Connection-bearing context (HTTP `AppState`, MCP main loop, the
34//! storage / wire-check hook installers). Multiple Connections in the
35//! same process get multiple independent caches by construction —
36//! cross-conn poisoning is structurally impossible. The `Arc<RuleCache>`
37//! is shared by reference, not via a global, so the cache lifetime
38//! tracks the daemon (or test fixture) that owns it. When the owner
39//! drops, the cache drops with it.
40//!
41//! ## Cache shape
42//!
43//! - `by_kind: RwLock<HashMap<String, Arc<Vec<Rule>>>>` keyed on the
44//!   canonical kind strings emitted by `AgentAction::kind()`
45//!   (`"bash"`, `"filesystem_write"`, `"network_request"`,
46//!   `"process_spawn"`, `"custom:<discriminator>"`).
47//! - `get_or_load(conn, kind)` returns an `Arc<Vec<Rule>>` so callers
48//!   share the snapshot without cloning the row data. The
49//!   [`crate::governance::RuleEngine`] holds the `Arc` for the
50//!   lifetime of a single check, then drops it.
51//! - The Ed25519 signature verify happens INSIDE
52//!   [`crate::governance::rules_store::list_enabled_by_kind`] on the
53//!   load path; the cache hit avoids it entirely until invalidation
54//!   forces a reload.
55//!
56//! ## Invalidation — honest contract (post-#1015 doc-drift fix)
57//!
58//! **No automatic invalidation on rule writes.** The cache is
59//! **invalidate-on-restart-only** at v0.7.0:
60//!
61//! - The substrate-internal rule-write surface
62//!   ([`crate::governance::rules_store::insert`] /
63//!   [`crate::governance::rules_store::remove`] /
64//!   [`crate::governance::rules_store::set_enabled`] /
65//!   [`crate::governance::rules_store::update_signature`]) does NOT
66//!   hold an `Arc<RuleCache>` reference and does NOT call
67//!   [`RuleCache::invalidate_all`] after a write.
68//! - Rule writes happen exclusively via the CLI (`ai-memory rules
69//!   …`), which runs as a separate process from any live daemon. The
70//!   daemon's cache cannot observe a sibling-process rule write at
71//!   all, regardless of whether `invalidate_all` is wired in
72//!   intra-process — same effective contract.
73//! - The daemon does NOT expose an HTTP / MCP rule-write surface at
74//!   v0.7.0. If a future release adds one, the wire should call
75//!   [`RuleCache::invalidate_all`] explicitly before returning to the
76//!   caller (or thread an `Arc<RuleCache>` through the rules_store
77//!   mutators — #1015 tracks the option).
78//!
79//! **What this means in practice:** after `ai-memory rules add` /
80//! `enable` / `disable` / `remove` from the CLI, the operator must
81//! restart any running `ai-memory serve` (or MCP daemon) for the
82//! change to take effect on the daemon's cached rule set. This
83//! matches the pre-#990 #983 contract (rule changes via a separate
84//! process require daemon restart). The operator-edited rule volume
85//! is low enough that this is acceptable; the operator UI
86//! (`ai-memory rules list`) reads directly from SQLite so the
87//! source-of-truth is always current.
88//!
89//! [`RuleCache::invalidate`] and [`RuleCache::invalidate_all`] remain
90//! exposed for tests (which want a fresh cache per fixture) and for
91//! a future `--reload-rules` SIGHUP / admin endpoint that would call
92//! them explicitly. They are NOT load-bearing on the v0.7.0 hot
93//! write path.
94
95use std::collections::HashMap;
96use std::sync::{Arc, RwLock};
97
98use rusqlite::Connection;
99
100use crate::governance::rules_store::Rule;
101
102/// v0.7.0 #1020 (Agent-1 #6) — typed error for the substrate-public
103/// [`RuleCache::get_or_load`] surface. Per CLAUDE.md #964
104/// audit-closure rule, substrate-public APIs MUST use typed errors;
105/// pre-#1020 `get_or_load` returned `anyhow::Result<Arc<Vec<Rule>>>`,
106/// leaking the untyped error type across the substrate boundary at a
107/// brand-new (#991) API.
108///
109/// Variants:
110///
111/// - [`RuleCacheError::Load`] wraps a downstream load failure from
112///   [`crate::governance::rules_store::list_enabled_by_kind`]. The
113///   inner `anyhow::Error` preserves the rusqlite + context chain
114///   for diagnostics; downstream code paths that need the chain
115///   call `.0` directly or `From::<RuleCacheError>::from` into
116///   their own anyhow surface.
117/// - [`RuleCacheError::Poisoned`] surfaces an `RwLock` poison —
118///   previously swallowed with a silent fallback to the
119///   freshly-loaded snapshot. Surfacing it lets callers decide
120///   whether to refuse, retry, or rebuild the cache.
121#[derive(Debug)]
122pub enum RuleCacheError {
123    /// `list_enabled_by_kind` failed (rusqlite error or downstream
124    /// signature-verification panic).
125    Load(anyhow::Error),
126    /// The inner `RwLock` is poisoned (another thread panicked
127    /// while holding it). The legacy fallback returned the freshly-
128    /// loaded snapshot; callers can still do so via
129    /// [`RuleCacheError::is_poisoned`] but the typed signal lets
130    /// observability paths surface the panic.
131    Poisoned,
132}
133
134impl std::fmt::Display for RuleCacheError {
135    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
136        match self {
137            Self::Load(e) => write!(f, "rule_cache load failed: {e:#}"),
138            Self::Poisoned => write!(f, "rule_cache: RwLock poisoned"),
139        }
140    }
141}
142
143impl std::error::Error for RuleCacheError {
144    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
145        match self {
146            Self::Load(e) => Some(e.as_ref()),
147            Self::Poisoned => None,
148        }
149    }
150}
151
152impl From<anyhow::Error> for RuleCacheError {
153    fn from(e: anyhow::Error) -> Self {
154        Self::Load(e)
155    }
156}
157
158// Note: `From<RuleCacheError> for anyhow::Error` is provided by
159// anyhow's blanket `impl<E: StdError + Send + Sync + 'static>
160// From<E> for anyhow::Error`, since `RuleCacheError: std::error::Error`.
161// Callers using `?` against an `anyhow::Result` automatically
162// upcast — no manual conversion needed.
163
164impl RuleCacheError {
165    /// Returns true for the [`RuleCacheError::Poisoned`] variant.
166    /// Callers that want the legacy "silent fallback" posture can
167    /// branch on this to recover the freshly-loaded snapshot from
168    /// a parallel `list_enabled_by_kind` call.
169    #[must_use]
170    pub fn is_poisoned(&self) -> bool {
171        matches!(self, Self::Poisoned)
172    }
173}
174
175/// Per-instance cache of `Vec<Rule>` keyed by `AgentAction::kind()`.
176///
177/// The cache is owned by a Connection-bearing context (HTTP
178/// `AppState`, MCP main loop, or the substrate `GOVERNANCE_PRE_WRITE`
179/// / `GOVERNANCE_PRE_ACTION` hook installer that captures a
180/// long-lived Connection). Pass `&RuleCache` (or wrap in `Arc` for
181/// shared ownership) to the cached entry points
182/// (`check_agent_action_cached`, etc.). Cache hits return an
183/// `Arc<Vec<Rule>>` clone — no row data is cloned on the fast path.
184#[derive(Debug, Default)]
185pub struct RuleCache {
186    by_kind: RwLock<HashMap<String, Arc<Vec<Rule>>>>,
187}
188
189impl RuleCache {
190    /// Construct an empty cache. Cheap; safe to call per-test.
191    #[must_use]
192    pub fn new() -> Self {
193        Self::default()
194    }
195
196    /// Return the cached rule list for `kind`, loading via
197    /// [`crate::governance::rules_store::list_enabled_by_kind`] on
198    /// cache miss. The Ed25519 signature verify side-effect on the
199    /// loader path runs on miss; cache hits skip it.
200    ///
201    /// v0.7.0 #1020 (Agent-1 #6) — returns the typed
202    /// [`RuleCacheError`] enum at the substrate-public boundary
203    /// (was bare `anyhow::Result` pre-#1020).
204    /// `RuleCacheError: Into<anyhow::Error>` is implemented so
205    /// existing anyhow-chained callers can keep their `?` operator
206    /// with no signature changes.
207    ///
208    /// # Errors
209    ///
210    /// - [`RuleCacheError::Load`] wraps any error from
211    ///   `list_enabled_by_kind` (rusqlite errors, signature-verify
212    ///   panics).
213    /// - [`RuleCacheError::Poisoned`] indicates the inner `RwLock`
214    ///   is poisoned by a prior thread panic. The legacy fallback
215    ///   returned a fresh snapshot anyway; callers wanting that
216    ///   posture can branch via [`RuleCacheError::is_poisoned`] and
217    ///   re-call `list_enabled_by_kind` directly.
218    #[inline]
219    pub fn get_or_load(
220        &self,
221        conn: &Connection,
222        kind: &str,
223    ) -> std::result::Result<Arc<Vec<Rule>>, RuleCacheError> {
224        // Fast path: hold the read lock for the lookup + clone of
225        // the Arc; drop the guard before any further work.
226        if let Some(rules) = self
227            .by_kind
228            .read()
229            .ok()
230            .and_then(|guard| guard.get(kind).cloned())
231        {
232            return Ok(rules);
233        }
234        // Slow path: load + insert under the write lock. The Arc<Vec>
235        // we return is cloned from the inserted entry so a concurrent
236        // invalidate after this insert doesn't strand our caller with
237        // a dropped snapshot.
238        let rules = crate::governance::rules_store::list_enabled_by_kind(conn, kind)
239            .map_err(RuleCacheError::Load)?;
240        let arc = Arc::new(rules);
241        if let Ok(mut guard) = self.by_kind.write() {
242            // #1019 — check for an existing entry first (e.g., the
243            // loser of a race) BEFORE allocating a fresh String via
244            // `kind.to_string()`. Pre-#1019 the `entry(kind.to_string())`
245            // call allocated on every slow-path invocation, including
246            // the race-loser case where the allocated String was
247            // immediately dropped. The contains_key probe avoids the
248            // allocation entirely on the common race-loser path.
249            if let Some(existing) = guard.get(kind) {
250                return Ok(Arc::clone(existing));
251            }
252            // No race — we're the first to insert.
253            let entry = guard
254                .entry(kind.to_string())
255                .or_insert_with(|| Arc::clone(&arc));
256            return Ok(Arc::clone(entry));
257        }
258        // v0.7.0 #1020 — RwLock poison surfaces as a typed
259        // `RuleCacheError::Poisoned`. The legacy fallback returned
260        // the freshly-loaded snapshot silently; callers wanting that
261        // posture can recover via `err.is_poisoned()` and re-call
262        // `list_enabled_by_kind` directly. Surfacing the poison lets
263        // observability paths (metrics counter, tracing::error) see
264        // the prior thread panic.
265        Err(RuleCacheError::Poisoned)
266    }
267
268    /// Drop the cached entry for `kind`. Currently no caller takes
269    /// this path because the rules_store writers don't know the
270    /// affected kind without inspecting the row;
271    /// [`Self::invalidate_all`] is simpler.
272    #[inline]
273    pub fn invalidate(&self, kind: &str) {
274        if let Ok(mut guard) = self.by_kind.write() {
275            guard.remove(kind);
276        }
277    }
278
279    /// Drop every cached entry. Used by the rules_store write paths
280    /// (insert / remove / set_enabled / update_signature) so the next
281    /// reader rebuilds against the post-write state.
282    #[inline]
283    pub fn invalidate_all(&self) {
284        if let Ok(mut guard) = self.by_kind.write() {
285            guard.clear();
286        }
287    }
288
289    /// Number of currently-cached entries — for test inspection.
290    ///
291    /// **#1018 (2026-05-21):** poison semantics changed from "lie as
292    /// empty" to "return None via [`Self::len_checked`]". The legacy
293    /// `len()` returns `0` on poison (matches the pre-#1018 contract
294    /// some tests still depend on); call sites that need to
295    /// distinguish "empty" from "poisoned" use `len_checked`.
296    #[must_use]
297    #[inline]
298    pub fn len(&self) -> usize {
299        self.by_kind
300            .read()
301            .map(|guard| guard.len())
302            .unwrap_or_default()
303    }
304
305    /// **#1018 (2026-05-21):** poison-aware variant of [`Self::len`].
306    /// Returns `Some(len)` on a healthy lock, `None` if the `RwLock`
307    /// is poisoned. Pre-#1018 a poisoned cache reported `len() == 0`
308    /// and `is_empty() == true` — invisible to callers and test
309    /// assertions. Operator inspection paths + future health-probe
310    /// surfaces should consume this variant; legacy paths keep `len()`.
311    #[must_use]
312    pub fn len_checked(&self) -> Option<usize> {
313        self.by_kind.read().ok().map(|guard| guard.len())
314    }
315
316    /// Whether the cache is empty — for test inspection.
317    ///
318    /// **#1018:** the legacy contract treats poison as "empty" (returns
319    /// `true`). Use [`Self::is_empty_checked`] when the distinction
320    /// matters.
321    #[must_use]
322    #[inline]
323    pub fn is_empty(&self) -> bool {
324        self.len() == 0
325    }
326
327    /// **#1018 (2026-05-21):** poison-aware variant of [`Self::is_empty`].
328    /// Returns `Some(true)` when the cache is healthy AND empty,
329    /// `Some(false)` when healthy AND populated, `None` when the
330    /// `RwLock` is poisoned. Operator inspection paths use this.
331    #[must_use]
332    pub fn is_empty_checked(&self) -> Option<bool> {
333        self.by_kind.read().ok().map(|guard| guard.is_empty())
334    }
335}
336
337#[cfg(test)]
338mod tests {
339    use super::*;
340    use crate::governance::rules_store::Rule;
341
342    fn sample_rule(id: &str, kind: &str) -> Rule {
343        Rule {
344            id: id.to_string(),
345            kind: kind.to_string(),
346            matcher: "{}".to_string(),
347            severity: "log".to_string(),
348            reason: "test".to_string(),
349            namespace: String::new(),
350            created_by: "test".to_string(),
351            created_at: 0,
352            enabled: true,
353            signature: None,
354            attest_level: "unsigned".to_string(),
355        }
356    }
357
358    #[test]
359    fn invalidate_all_clears_every_entry_991() {
360        let cache = RuleCache::new();
361        {
362            let mut g = cache.by_kind.write().unwrap();
363            g.insert(
364                "bash".to_string(),
365                Arc::new(vec![sample_rule("r1", "bash")]),
366            );
367            g.insert(
368                "filesystem_write".to_string(),
369                Arc::new(vec![sample_rule("r2", "filesystem_write")]),
370            );
371        }
372        assert_eq!(cache.len(), 2);
373        cache.invalidate_all();
374        assert_eq!(cache.len(), 0);
375        assert!(cache.is_empty());
376    }
377
378    #[test]
379    fn invalidate_specific_kind_keeps_others_991() {
380        let cache = RuleCache::new();
381        {
382            let mut g = cache.by_kind.write().unwrap();
383            g.insert(
384                "bash".to_string(),
385                Arc::new(vec![sample_rule("r1", "bash")]),
386            );
387            g.insert(
388                "filesystem_write".to_string(),
389                Arc::new(vec![sample_rule("r2", "filesystem_write")]),
390            );
391        }
392        cache.invalidate("bash");
393        assert_eq!(cache.len(), 1);
394        let remaining = cache.by_kind.read().unwrap();
395        assert!(remaining.contains_key("filesystem_write"));
396        assert!(!remaining.contains_key("bash"));
397    }
398
399    #[test]
400    fn cross_instance_isolation_no_poisoning_991() {
401        // The #990 revert reason: a process-wide cache keyed only on
402        // `kind` poisoned multi-connection tests. Two `RuleCache`
403        // instances must not share entries — pinned here so any
404        // future refactor that re-introduces global state surfaces
405        // immediately.
406        let cache_a = RuleCache::new();
407        let cache_b = RuleCache::new();
408        {
409            let mut g = cache_a.by_kind.write().unwrap();
410            g.insert(
411                "filesystem_write".to_string(),
412                Arc::new(vec![sample_rule("peer-a-r", "filesystem_write")]),
413            );
414        }
415        assert_eq!(cache_a.len(), 1);
416        // cache_b never saw the insert — strict isolation.
417        assert_eq!(cache_b.len(), 0);
418    }
419
420    #[test]
421    fn rule_cache_error_implements_std_error_1020() {
422        // v0.7.0 #1020 — pin the typed-error contract: RuleCacheError
423        // implements std::error::Error, displays the inner anyhow
424        // chain on Load, and is auto-upcastable into anyhow::Error
425        // via the std::error::Error blanket impl.
426        let load_err: RuleCacheError = anyhow::anyhow!("synthetic rusqlite failure").into();
427        assert!(matches!(load_err, RuleCacheError::Load(_)));
428        assert!(!load_err.is_poisoned());
429        // Display surfaces the inner chain.
430        let display = format!("{load_err}");
431        assert!(
432            display.contains("rule_cache load failed")
433                && display.contains("synthetic rusqlite failure"),
434            "#1020: Load Display MUST surface the wrapped anyhow chain; got {display}"
435        );
436
437        let poison_err = RuleCacheError::Poisoned;
438        assert!(poison_err.is_poisoned());
439        assert!(format!("{poison_err}").contains("RwLock poisoned"));
440
441        // Auto-upcast into anyhow::Error via the blanket impl on
442        // std::error::Error — confirms `?` against an anyhow::Result
443        // still works for callers post-#1020.
444        let upcast: anyhow::Error = poison_err.into();
445        assert!(format!("{upcast:#}").contains("RwLock poisoned"));
446    }
447
448    #[test]
449    fn dropped_instance_drops_entries_991() {
450        // The original #983 design had a process-wide singleton that
451        // never freed entries until process exit. The per-instance
452        // design must let entries drop when the owner drops.
453        let weak;
454        {
455            let cache = RuleCache::new();
456            let entry = Arc::new(vec![sample_rule("r1", "bash")]);
457            weak = Arc::downgrade(&entry);
458            cache
459                .by_kind
460                .write()
461                .unwrap()
462                .insert("bash".to_string(), entry);
463            assert!(weak.upgrade().is_some(), "entry alive while cache alive");
464        }
465        // `cache` dropped → its `HashMap` dropped → the inner
466        // `Arc<Vec<Rule>>` ref count drops to zero → `Weak::upgrade`
467        // returns None.
468        assert!(
469            weak.upgrade().is_none(),
470            "cache drop must release Arc<Vec<Rule>> entries"
471        );
472    }
473}