ai_memory/governance/rule_cache.rs
1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3
4//! v0.7.0 #991 — per-instance cache for [`crate::governance::Rule`]
5//! lists keyed by [`crate::governance::AgentAction::kind`].
6//!
7//! ## Why this exists
8//!
9//! Pre-#991 every governance check (`check_agent_action*`) called
10//! [`crate::governance::RuleEngine::load_for_action`] which in turn
11//! called [`crate::governance::rules_store::list_enabled_by_kind`].
12//! That helper prepares a SQL statement, executes it, deserializes
13//! each row, AND runs Ed25519 signature verification on every signed
14//! row (the L1-6 bypass-impossibility invariant). The substrate
15//! issues a governance check on every memory write (store / link /
16//! delete / archive / forget) — so the helper fires once per write
17//! plus once per dispatched MCP / HTTP call that doesn't fast-path
18//! out, totalling 0.5-3 ms of avoidable work per write under the
19//! v0.7.0 typed-refusal envelope (#963) wire-up.
20//!
21//! ## Why this is per-instance, not process-wide (the #990 lesson)
22//!
23//! #983 shipped a process-wide singleton keyed on
24//! [`crate::governance::AgentAction::kind`] alone. The cache was
25//! reverted via #990 because multi-connection integration tests
26//! (`tests/governance_a2a_rules.rs::disabled_rule_at_peer_b_does_not_enforce_even_if_enabled_at_a`)
27//! exposed cross-connection key collisions: peer_b's load polluted
28//! peer_a's subsequent lookup. In production this was invisible
29//! (single daemon connection) but reverting was necessary to restore
30//! test correctness on the v0.7.0 ship-readiness gate.
31//!
32//! #991 takes the per-instance approach: the cache is *owned* by the
33//! Connection-bearing context (HTTP `AppState`, MCP main loop, the
34//! storage / wire-check hook installers). Multiple Connections in the
35//! same process get multiple independent caches by construction —
36//! cross-conn poisoning is structurally impossible. The `Arc<RuleCache>`
37//! is shared by reference, not via a global, so the cache lifetime
38//! tracks the daemon (or test fixture) that owns it. When the owner
39//! drops, the cache drops with it.
40//!
41//! ## Cache shape
42//!
43//! - `by_kind: RwLock<HashMap<String, Arc<Vec<Rule>>>>` keyed on the
44//! canonical kind strings emitted by `AgentAction::kind()`
45//! (`"bash"`, `"filesystem_write"`, `"network_request"`,
46//! `"process_spawn"`, `"custom:<discriminator>"`).
47//! - `get_or_load(conn, kind)` returns an `Arc<Vec<Rule>>` so callers
48//! share the snapshot without cloning the row data. The
49//! [`crate::governance::RuleEngine`] holds the `Arc` for the
50//! lifetime of a single check, then drops it.
51//! - The Ed25519 signature verify happens INSIDE
52//! [`crate::governance::rules_store::list_enabled_by_kind`] on the
53//! load path; the cache hit avoids it entirely until invalidation
54//! forces a reload.
55//!
56//! ## Invalidation — honest contract (post-#1015 doc-drift fix)
57//!
58//! **No automatic invalidation on rule writes.** The cache is
59//! **invalidate-on-restart-only** at v0.7.0:
60//!
61//! - The substrate-internal rule-write surface
62//! ([`crate::governance::rules_store::insert`] /
63//! [`crate::governance::rules_store::remove`] /
64//! [`crate::governance::rules_store::set_enabled`] /
65//! [`crate::governance::rules_store::update_signature`]) does NOT
66//! hold an `Arc<RuleCache>` reference and does NOT call
67//! [`RuleCache::invalidate_all`] after a write.
68//! - Rule writes happen exclusively via the CLI (`ai-memory rules
69//! …`), which runs as a separate process from any live daemon. The
70//! daemon's cache cannot observe a sibling-process rule write at
71//! all, regardless of whether `invalidate_all` is wired in
72//! intra-process — same effective contract.
73//! - The daemon does NOT expose an HTTP / MCP rule-write surface at
74//! v0.7.0. If a future release adds one, the wire should call
75//! [`RuleCache::invalidate_all`] explicitly before returning to the
76//! caller (or thread an `Arc<RuleCache>` through the rules_store
77//! mutators — #1015 tracks the option).
78//!
79//! **What this means in practice:** after `ai-memory rules add` /
80//! `enable` / `disable` / `remove` from the CLI, the operator must
81//! restart any running `ai-memory serve` (or MCP daemon) for the
82//! change to take effect on the daemon's cached rule set. This
83//! matches the pre-#990 #983 contract (rule changes via a separate
84//! process require daemon restart). The operator-edited rule volume
85//! is low enough that this is acceptable; the operator UI
86//! (`ai-memory rules list`) reads directly from SQLite so the
87//! source-of-truth is always current.
88//!
89//! [`RuleCache::invalidate`] and [`RuleCache::invalidate_all`] remain
90//! exposed for tests (which want a fresh cache per fixture) and for
91//! a future `--reload-rules` SIGHUP / admin endpoint that would call
92//! them explicitly. They are NOT load-bearing on the v0.7.0 hot
93//! write path.
94
95use std::collections::HashMap;
96use std::sync::{Arc, RwLock};
97
98use rusqlite::Connection;
99
100use crate::governance::rules_store::Rule;
101
102/// v0.7.0 #1020 (Agent-1 #6) — typed error for the substrate-public
103/// [`RuleCache::get_or_load`] surface. Per CLAUDE.md #964
104/// audit-closure rule, substrate-public APIs MUST use typed errors;
105/// pre-#1020 `get_or_load` returned `anyhow::Result<Arc<Vec<Rule>>>`,
106/// leaking the untyped error type across the substrate boundary at a
107/// brand-new (#991) API.
108///
109/// Variants:
110///
111/// - [`RuleCacheError::Load`] wraps a downstream load failure from
112/// [`crate::governance::rules_store::list_enabled_by_kind`]. The
113/// inner `anyhow::Error` preserves the rusqlite + context chain
114/// for diagnostics; downstream code paths that need the chain
115/// call `.0` directly or `From::<RuleCacheError>::from` into
116/// their own anyhow surface.
117/// - [`RuleCacheError::Poisoned`] surfaces an `RwLock` poison —
118/// previously swallowed with a silent fallback to the
119/// freshly-loaded snapshot. Surfacing it lets callers decide
120/// whether to refuse, retry, or rebuild the cache.
121#[derive(Debug)]
122pub enum RuleCacheError {
123 /// `list_enabled_by_kind` failed (rusqlite error or downstream
124 /// signature-verification panic).
125 Load(anyhow::Error),
126 /// The inner `RwLock` is poisoned (another thread panicked
127 /// while holding it). The legacy fallback returned the freshly-
128 /// loaded snapshot; callers can still do so via
129 /// [`RuleCacheError::is_poisoned`] but the typed signal lets
130 /// observability paths surface the panic.
131 Poisoned,
132}
133
134impl std::fmt::Display for RuleCacheError {
135 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
136 match self {
137 Self::Load(e) => write!(f, "rule_cache load failed: {e:#}"),
138 Self::Poisoned => write!(f, "rule_cache: RwLock poisoned"),
139 }
140 }
141}
142
143impl std::error::Error for RuleCacheError {
144 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
145 match self {
146 Self::Load(e) => Some(e.as_ref()),
147 Self::Poisoned => None,
148 }
149 }
150}
151
152impl From<anyhow::Error> for RuleCacheError {
153 fn from(e: anyhow::Error) -> Self {
154 Self::Load(e)
155 }
156}
157
158// Note: `From<RuleCacheError> for anyhow::Error` is provided by
159// anyhow's blanket `impl<E: StdError + Send + Sync + 'static>
160// From<E> for anyhow::Error`, since `RuleCacheError: std::error::Error`.
161// Callers using `?` against an `anyhow::Result` automatically
162// upcast — no manual conversion needed.
163
164impl RuleCacheError {
165 /// Returns true for the [`RuleCacheError::Poisoned`] variant.
166 /// Callers that want the legacy "silent fallback" posture can
167 /// branch on this to recover the freshly-loaded snapshot from
168 /// a parallel `list_enabled_by_kind` call.
169 #[must_use]
170 pub fn is_poisoned(&self) -> bool {
171 matches!(self, Self::Poisoned)
172 }
173}
174
175/// Per-instance cache of `Vec<Rule>` keyed by `AgentAction::kind()`.
176///
177/// The cache is owned by a Connection-bearing context (HTTP
178/// `AppState`, MCP main loop, or the substrate `GOVERNANCE_PRE_WRITE`
179/// / `GOVERNANCE_PRE_ACTION` hook installer that captures a
180/// long-lived Connection). Pass `&RuleCache` (or wrap in `Arc` for
181/// shared ownership) to the cached entry points
182/// (`check_agent_action_cached`, etc.). Cache hits return an
183/// `Arc<Vec<Rule>>` clone — no row data is cloned on the fast path.
184#[derive(Debug, Default)]
185pub struct RuleCache {
186 by_kind: RwLock<HashMap<String, Arc<Vec<Rule>>>>,
187}
188
189impl RuleCache {
190 /// Construct an empty cache. Cheap; safe to call per-test.
191 #[must_use]
192 pub fn new() -> Self {
193 Self::default()
194 }
195
196 /// Return the cached rule list for `kind`, loading via
197 /// [`crate::governance::rules_store::list_enabled_by_kind`] on
198 /// cache miss. The Ed25519 signature verify side-effect on the
199 /// loader path runs on miss; cache hits skip it.
200 ///
201 /// v0.7.0 #1020 (Agent-1 #6) — returns the typed
202 /// [`RuleCacheError`] enum at the substrate-public boundary
203 /// (was bare `anyhow::Result` pre-#1020).
204 /// `RuleCacheError: Into<anyhow::Error>` is implemented so
205 /// existing anyhow-chained callers can keep their `?` operator
206 /// with no signature changes.
207 ///
208 /// # Errors
209 ///
210 /// - [`RuleCacheError::Load`] wraps any error from
211 /// `list_enabled_by_kind` (rusqlite errors, signature-verify
212 /// panics).
213 /// - [`RuleCacheError::Poisoned`] indicates the inner `RwLock`
214 /// is poisoned by a prior thread panic. The legacy fallback
215 /// returned a fresh snapshot anyway; callers wanting that
216 /// posture can branch via [`RuleCacheError::is_poisoned`] and
217 /// re-call `list_enabled_by_kind` directly.
218 #[inline]
219 pub fn get_or_load(
220 &self,
221 conn: &Connection,
222 kind: &str,
223 ) -> std::result::Result<Arc<Vec<Rule>>, RuleCacheError> {
224 // Fast path: hold the read lock for the lookup + clone of
225 // the Arc; drop the guard before any further work.
226 if let Some(rules) = self
227 .by_kind
228 .read()
229 .ok()
230 .and_then(|guard| guard.get(kind).cloned())
231 {
232 return Ok(rules);
233 }
234 // Slow path: load + insert under the write lock. The Arc<Vec>
235 // we return is cloned from the inserted entry so a concurrent
236 // invalidate after this insert doesn't strand our caller with
237 // a dropped snapshot.
238 let rules = crate::governance::rules_store::list_enabled_by_kind(conn, kind)
239 .map_err(RuleCacheError::Load)?;
240 let arc = Arc::new(rules);
241 if let Ok(mut guard) = self.by_kind.write() {
242 // #1019 — check for an existing entry first (e.g., the
243 // loser of a race) BEFORE allocating a fresh String via
244 // `kind.to_string()`. Pre-#1019 the `entry(kind.to_string())`
245 // call allocated on every slow-path invocation, including
246 // the race-loser case where the allocated String was
247 // immediately dropped. The contains_key probe avoids the
248 // allocation entirely on the common race-loser path.
249 if let Some(existing) = guard.get(kind) {
250 return Ok(Arc::clone(existing));
251 }
252 // No race — we're the first to insert.
253 let entry = guard
254 .entry(kind.to_string())
255 .or_insert_with(|| Arc::clone(&arc));
256 return Ok(Arc::clone(entry));
257 }
258 // v0.7.0 #1020 — RwLock poison surfaces as a typed
259 // `RuleCacheError::Poisoned`. The legacy fallback returned
260 // the freshly-loaded snapshot silently; callers wanting that
261 // posture can recover via `err.is_poisoned()` and re-call
262 // `list_enabled_by_kind` directly. Surfacing the poison lets
263 // observability paths (metrics counter, tracing::error) see
264 // the prior thread panic.
265 Err(RuleCacheError::Poisoned)
266 }
267
268 /// Drop the cached entry for `kind`. Currently no caller takes
269 /// this path because the rules_store writers don't know the
270 /// affected kind without inspecting the row;
271 /// [`Self::invalidate_all`] is simpler.
272 #[inline]
273 pub fn invalidate(&self, kind: &str) {
274 if let Ok(mut guard) = self.by_kind.write() {
275 guard.remove(kind);
276 }
277 }
278
279 /// Drop every cached entry. Used by the rules_store write paths
280 /// (insert / remove / set_enabled / update_signature) so the next
281 /// reader rebuilds against the post-write state.
282 #[inline]
283 pub fn invalidate_all(&self) {
284 if let Ok(mut guard) = self.by_kind.write() {
285 guard.clear();
286 }
287 }
288
289 /// Number of currently-cached entries — for test inspection.
290 ///
291 /// **#1018 (2026-05-21):** poison semantics changed from "lie as
292 /// empty" to "return None via [`Self::len_checked`]". The legacy
293 /// `len()` returns `0` on poison (matches the pre-#1018 contract
294 /// some tests still depend on); call sites that need to
295 /// distinguish "empty" from "poisoned" use `len_checked`.
296 #[must_use]
297 #[inline]
298 pub fn len(&self) -> usize {
299 self.by_kind
300 .read()
301 .map(|guard| guard.len())
302 .unwrap_or_default()
303 }
304
305 /// **#1018 (2026-05-21):** poison-aware variant of [`Self::len`].
306 /// Returns `Some(len)` on a healthy lock, `None` if the `RwLock`
307 /// is poisoned. Pre-#1018 a poisoned cache reported `len() == 0`
308 /// and `is_empty() == true` — invisible to callers and test
309 /// assertions. Operator inspection paths + future health-probe
310 /// surfaces should consume this variant; legacy paths keep `len()`.
311 #[must_use]
312 pub fn len_checked(&self) -> Option<usize> {
313 self.by_kind.read().ok().map(|guard| guard.len())
314 }
315
316 /// Whether the cache is empty — for test inspection.
317 ///
318 /// **#1018:** the legacy contract treats poison as "empty" (returns
319 /// `true`). Use [`Self::is_empty_checked`] when the distinction
320 /// matters.
321 #[must_use]
322 #[inline]
323 pub fn is_empty(&self) -> bool {
324 self.len() == 0
325 }
326
327 /// **#1018 (2026-05-21):** poison-aware variant of [`Self::is_empty`].
328 /// Returns `Some(true)` when the cache is healthy AND empty,
329 /// `Some(false)` when healthy AND populated, `None` when the
330 /// `RwLock` is poisoned. Operator inspection paths use this.
331 #[must_use]
332 pub fn is_empty_checked(&self) -> Option<bool> {
333 self.by_kind.read().ok().map(|guard| guard.is_empty())
334 }
335}
336
337#[cfg(test)]
338mod tests {
339 use super::*;
340 use crate::governance::rules_store::Rule;
341
342 fn sample_rule(id: &str, kind: &str) -> Rule {
343 Rule {
344 id: id.to_string(),
345 kind: kind.to_string(),
346 matcher: "{}".to_string(),
347 severity: "log".to_string(),
348 reason: "test".to_string(),
349 namespace: String::new(),
350 created_by: "test".to_string(),
351 created_at: 0,
352 enabled: true,
353 signature: None,
354 attest_level: "unsigned".to_string(),
355 }
356 }
357
358 #[test]
359 fn invalidate_all_clears_every_entry_991() {
360 let cache = RuleCache::new();
361 {
362 let mut g = cache.by_kind.write().unwrap();
363 g.insert(
364 "bash".to_string(),
365 Arc::new(vec![sample_rule("r1", "bash")]),
366 );
367 g.insert(
368 "filesystem_write".to_string(),
369 Arc::new(vec![sample_rule("r2", "filesystem_write")]),
370 );
371 }
372 assert_eq!(cache.len(), 2);
373 cache.invalidate_all();
374 assert_eq!(cache.len(), 0);
375 assert!(cache.is_empty());
376 }
377
378 #[test]
379 fn invalidate_specific_kind_keeps_others_991() {
380 let cache = RuleCache::new();
381 {
382 let mut g = cache.by_kind.write().unwrap();
383 g.insert(
384 "bash".to_string(),
385 Arc::new(vec![sample_rule("r1", "bash")]),
386 );
387 g.insert(
388 "filesystem_write".to_string(),
389 Arc::new(vec![sample_rule("r2", "filesystem_write")]),
390 );
391 }
392 cache.invalidate("bash");
393 assert_eq!(cache.len(), 1);
394 let remaining = cache.by_kind.read().unwrap();
395 assert!(remaining.contains_key("filesystem_write"));
396 assert!(!remaining.contains_key("bash"));
397 }
398
399 #[test]
400 fn cross_instance_isolation_no_poisoning_991() {
401 // The #990 revert reason: a process-wide cache keyed only on
402 // `kind` poisoned multi-connection tests. Two `RuleCache`
403 // instances must not share entries — pinned here so any
404 // future refactor that re-introduces global state surfaces
405 // immediately.
406 let cache_a = RuleCache::new();
407 let cache_b = RuleCache::new();
408 {
409 let mut g = cache_a.by_kind.write().unwrap();
410 g.insert(
411 "filesystem_write".to_string(),
412 Arc::new(vec![sample_rule("peer-a-r", "filesystem_write")]),
413 );
414 }
415 assert_eq!(cache_a.len(), 1);
416 // cache_b never saw the insert — strict isolation.
417 assert_eq!(cache_b.len(), 0);
418 }
419
420 #[test]
421 fn rule_cache_error_implements_std_error_1020() {
422 // v0.7.0 #1020 — pin the typed-error contract: RuleCacheError
423 // implements std::error::Error, displays the inner anyhow
424 // chain on Load, and is auto-upcastable into anyhow::Error
425 // via the std::error::Error blanket impl.
426 let load_err: RuleCacheError = anyhow::anyhow!("synthetic rusqlite failure").into();
427 assert!(matches!(load_err, RuleCacheError::Load(_)));
428 assert!(!load_err.is_poisoned());
429 // Display surfaces the inner chain.
430 let display = format!("{load_err}");
431 assert!(
432 display.contains("rule_cache load failed")
433 && display.contains("synthetic rusqlite failure"),
434 "#1020: Load Display MUST surface the wrapped anyhow chain; got {display}"
435 );
436
437 let poison_err = RuleCacheError::Poisoned;
438 assert!(poison_err.is_poisoned());
439 assert!(format!("{poison_err}").contains("RwLock poisoned"));
440
441 // Auto-upcast into anyhow::Error via the blanket impl on
442 // std::error::Error — confirms `?` against an anyhow::Result
443 // still works for callers post-#1020.
444 let upcast: anyhow::Error = poison_err.into();
445 assert!(format!("{upcast:#}").contains("RwLock poisoned"));
446 }
447
448 #[test]
449 fn dropped_instance_drops_entries_991() {
450 // The original #983 design had a process-wide singleton that
451 // never freed entries until process exit. The per-instance
452 // design must let entries drop when the owner drops.
453 let weak;
454 {
455 let cache = RuleCache::new();
456 let entry = Arc::new(vec![sample_rule("r1", "bash")]);
457 weak = Arc::downgrade(&entry);
458 cache
459 .by_kind
460 .write()
461 .unwrap()
462 .insert("bash".to_string(), entry);
463 assert!(weak.upgrade().is_some(), "entry alive while cache alive");
464 }
465 // `cache` dropped → its `HashMap` dropped → the inner
466 // `Arc<Vec<Rule>>` ref count drops to zero → `Weak::upgrade`
467 // returns None.
468 assert!(
469 weak.upgrade().is_none(),
470 "cache drop must release Arc<Vec<Rule>> entries"
471 );
472 }
473}