ai_memory/hooks/timeouts.rs
1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// v0.7 Track G — Task G6: per-event-class hard timeouts.
5//
6// G2 (PR #563) shipped the 20-variant `HookEvent`. G3 (PR #567)
7// shipped per-hook `timeout_ms` enforcement inside the executor. G5
8// (PR #573) shipped `HookChain::fire` which iterates hooks in
9// priority order. G6 stitches the bound on the *whole chain*: a
10// hook chain firing on a hot event (recall, search, index) cannot
11// collectively burn more wall-clock than the event class allows,
12// even if individual `timeout_ms` knobs would otherwise sum past
13// the budget.
14//
15// # Why a class deadline at all
16//
// The v0.6.3 recall path holds a 50ms p95 budget. If three hooks,
// each configured with `timeout_ms = 1_000`, subscribe to
// `post_recall`, a single slow hook can blow the recall budget by
// 20×. Per-hook timeouts protect the *individual* hook from a
// runaway script; per-class timeouts protect the *operation* from
// the chain.
22//
// # The five classes
//
// Per V0.7-EPIC §G6 (plus the `HotPath` class added by G10), every
// `HookEvent` lands in exactly one of:
//
//   * Write      — pre/post_store, pre/post_delete, pre/post_promote,
//                  pre/post_link, pre/post_consolidate,
//                  pre/post_governance_decision, pre_archive,
//                  pre/post_reflect, pre_compaction,
//                  on_compaction_rollback.
//                  5000ms class deadline. Writes are user-initiated
//                  and rarer than reads, so we tolerate a longer
//                  chain (PII redaction → policy gate → audit emit
//                  chains can legitimately exceed 1s).
//   * Read       — pre/post_recall, pre/post_search.
//                  2000ms class deadline. Reads are the hot path;
//                  the budget is generous enough for a real
//                  guardrail hook (token classifier, RBAC check) but
//                  below the 5s write ceiling.
//   * Index      — on_index_eviction.
//                  1000ms class deadline. Index events fire from a
//                  maintenance background loop; a slow chain there
//                  cascades into an HNSW build stall.
//   * Transcript — pre/post_transcript_store.
//                  5000ms class deadline. Transcripts are user-
//                  initiated like writes, but can carry MB-scale
//                  payloads where compression / classification hooks
//                  plausibly take a second or more.
//   * HotPath    — pre_recall_expand.
//                  50ms class deadline. Synchronous hooks that fire
//                  *inside* the recall p95 budget; a hook that cannot
//                  answer within 50ms cannot sit on the read path
//                  without blowing the SLO.
48//
49// # How the budget is plumbed into `HookChain::fire`
50//
// `HookChain::fire` (in `chain.rs`) computes the class deadline at
// entry: `chain_deadline = Instant::now() + class_deadline_for_event(event)`.
53// Before firing each hook it derives the per-hook budget as
54// `min(chain_deadline - now, hook.timeout_ms)`. The executor
55// already enforces `timeout_ms` via `tokio::time::timeout`; G6
56// shrinks that knob on the fly when the chain itself is running out
57// of room. If the chain budget is fully consumed before the next
58// hook fires, the chain logs a warning, increments the
59// `timeout_violations` counter, and treats the remaining hooks as
60// fail-open `Allow` per G5's default `FailMode::Open` posture.
61//
62// # Doctor surface
63//
64// The chain accumulates a process-wide `timeout_violations` counter
65// (one global atomic, since the chain is built per-event and torn
66// down at end-of-fire — there's no per-chain home for state). The
67// doctor's `--hooks` block reads it via [`timeout_violations_total`]
68// and renders it alongside G3's existing `events_fired /
69// events_dropped / mean_latency_us` row.
70
71use std::sync::atomic::{AtomicU64, Ordering};
72use std::time::{Duration, Instant};
73
74use super::events::HookEvent;
75
76// ---------------------------------------------------------------------------
// EventClass — the five budget buckets
78// ---------------------------------------------------------------------------
79
/// Budget bucket that a [`HookEvent`] is sorted into; every bucket
/// carries its own whole-chain deadline.
///
/// The type is `Copy + Hash`, so downstream code may use it as a
/// `HashMap` key. The deadline table in this file is a `match`
/// rather than a map; the derives cost nothing and leave the door
/// open for the doctor surface.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum EventClass {
    /// Events that mutate state: store, delete, promote, link,
    /// consolidate, governance and archive.
    Write,
    /// Query-side events (recall and search). The hottest path, and
    /// the tightest budget outside of the index class.
    Read,
    /// Lifecycle events of the HNSW index, raised by the background
    /// maintenance loop.
    Index,
    /// Events of the transcript I-track. The budget equals the 5s
    /// write budget; the class is kept distinct because payload
    /// shape and call-site pressure differ from ordinary writes.
    Transcript,
    /// G10: synchronous hooks that run *inside* the 50ms recall p95
    /// budget. [`HookEvent::PreRecallExpand`] is the only member
    /// today; later synchronous hot-path hooks (for instance a
    /// `pre_search_expand`) would be added here. The 50ms ceiling
    /// deliberately sits below the v0.6.3 recall budget: a hook that
    /// cannot decide within 50ms cannot live on the read path
    /// without breaking the SLO.
    HotPath,
}
109
110// ---------------------------------------------------------------------------
111// Class deadlines — hardcoded per V0.7-EPIC §G6
112// ---------------------------------------------------------------------------
113
/// Whole-chain deadline, in milliseconds, for [`EventClass::Write`].
pub const WRITE_CLASS_DEADLINE_MS: u64 = 5_000;
/// Whole-chain deadline, in milliseconds, for [`EventClass::Read`].
pub const READ_CLASS_DEADLINE_MS: u64 = 2_000;
/// Whole-chain deadline, in milliseconds, for [`EventClass::Index`].
pub const INDEX_CLASS_DEADLINE_MS: u64 = 1_000;
/// Whole-chain deadline, in milliseconds, for
/// [`EventClass::Transcript`].
pub const TRANSCRIPT_CLASS_DEADLINE_MS: u64 = 5_000;
/// G10 — whole-chain deadline, in milliseconds, for
/// [`EventClass::HotPath`] (synchronous recall-budget hooks). The
/// value equals the v0.6.3 recall p95 budget of 50ms; a chain that
/// runs longer would blow the SLO. Because this is the ceiling for
/// the *entire chain*, an individual hook's `timeout_ms` may be
/// configured lower.
pub const HOT_PATH_CLASS_DEADLINE_MS: u64 = 50;
128
129// ---------------------------------------------------------------------------
130// event_class — the canonical mapping
131// ---------------------------------------------------------------------------
132
133/// Map a [`HookEvent`] to its [`EventClass`]. Total over the 25
134/// variants — the compiler's exhaustiveness check enforces the table
135/// stays in sync if a 26th event ever lands.
136#[must_use]
137pub fn event_class(event: HookEvent) -> EventClass {
138 match event {
139 // Writes: state-mutating memory operations.
140 HookEvent::PreStore
141 | HookEvent::PostStore
142 | HookEvent::PreDelete
143 | HookEvent::PostDelete
144 | HookEvent::PrePromote
145 | HookEvent::PostPromote
146 | HookEvent::PreLink
147 | HookEvent::PostLink
148 | HookEvent::PreConsolidate
149 | HookEvent::PostConsolidate
150 | HookEvent::PreGovernanceDecision
151 | HookEvent::PostGovernanceDecision
152 | HookEvent::PreArchive
153 // v0.7.0 Task 6/8: reflect lifecycle fires on the write
154 // path (the substrate inserts the new reflection memory +
155 // N reflects_on links inside a single transaction).
156 | HookEvent::PreReflect
157 | HookEvent::PostReflect
158 // v0.7.0 L1-7: compaction pipeline events are write-class
159 // (the pass may delete source rows and insert a summary).
160 | HookEvent::PreCompaction
161 | HookEvent::OnCompactionRollback => EventClass::Write,
162 // Reads: query path. Hot.
163 HookEvent::PreRecall
164 | HookEvent::PostRecall
165 | HookEvent::PreSearch
166 | HookEvent::PostSearch => EventClass::Read,
167 // Index: HNSW lifecycle.
168 HookEvent::OnIndexEviction => EventClass::Index,
169 // Transcripts: I-track interop.
170 HookEvent::PreTranscriptStore | HookEvent::PostTranscriptStore => EventClass::Transcript,
171 // G10: synchronous hot-path query expansion (50ms budget).
172 HookEvent::PreRecallExpand => EventClass::HotPath,
173 }
174}
175
176/// The hardcoded class deadline (as a [`Duration`]) for `class`.
177/// The `match` mirrors [`event_class`] inverse-style; a single
178/// branch means the compiler inlines this to a constant load at
179/// every call site.
180///
181/// **Issue #1207 — macOS timing-budget multiplier.** When running on
182/// macOS under parallel `cargo test` load, `fork+exec` of even a tiny
183/// shell script regularly takes >1s on a stressed dev host (Apple
184/// Silicon m1/m2/m3 alike). The 1000ms `Index` class deadline races
185/// the spawn budget and the test surfaces as a timeout — independent
186/// of the EAGAIN/ENOMEM/EMFILE spawn-errno class the rest of #1207
187/// addresses. The `AI_MEMORY_TEST_TIMING_BUDGET_MULT` env var is a
188/// test-only multiplier (default `1`) that scales every class deadline
189/// at runtime so tests can opt into a wider budget without changing
190/// production behaviour. The factory test runner sets this to `3` on
191/// macOS via the `tests/hooks_executor_test.rs` setup; production
192/// daemons inherit the unset default.
193///
194/// Compiled out of release binaries entirely via `cfg(any(test,
195/// debug_assertions))`. Production runs see the hardcoded constants
196/// at zero overhead — the env-var read fires only for `cargo test`.
197#[must_use]
198pub fn class_deadline(class: EventClass) -> Duration {
199 let base_ms = match class {
200 EventClass::Write => WRITE_CLASS_DEADLINE_MS,
201 EventClass::Read => READ_CLASS_DEADLINE_MS,
202 EventClass::Index => INDEX_CLASS_DEADLINE_MS,
203 EventClass::Transcript => TRANSCRIPT_CLASS_DEADLINE_MS,
204 EventClass::HotPath => HOT_PATH_CLASS_DEADLINE_MS,
205 };
206 Duration::from_millis(base_ms.saturating_mul(test_timing_budget_mult()))
207}
208
/// Test-only timing budget multiplier.
///
/// Reads `AI_MEMORY_TEST_TIMING_BUDGET_MULT` from the environment on
/// every call (no caching), so an individual test can set it
/// just-in-time. Surrounding whitespace — for example a trailing
/// newline picked up from a CI config file — is trimmed before
/// parsing, so `" 3\n"` is honoured instead of silently degrading
/// to the default.
///
/// Returns the parsed value when it lies in `1..=100`; in every
/// other case (unset, non-Unicode, unparseable, out of range) the
/// result is `1`, i.e. production behaviour.
///
/// Only compiled when `test` or `debug_assertions` is enabled. The
/// lookup is cheap relative to even the tightest 50ms `HotPath`
/// budget.
#[cfg(any(test, debug_assertions))]
fn test_timing_budget_mult() -> u64 {
    const VAR: &str = "AI_MEMORY_TEST_TIMING_BUDGET_MULT";
    std::env::var(VAR)
        .ok()
        // Trim first: `u64::from_str` rejects any surrounding whitespace.
        .and_then(|raw| raw.trim().parse::<u64>().ok())
        .filter(|mult| (1..=100).contains(mult))
        .unwrap_or(1)
}

/// Builds without `test` / `debug_assertions`: the multiplier is
/// always `1`, which lets the optimizer fold the `saturating_mul` in
/// `class_deadline` into a no-op.
#[cfg(not(any(test, debug_assertions)))]
#[inline(always)]
fn test_timing_budget_mult() -> u64 {
    1
}
231
232/// Convenience wrapper: `class_deadline(event_class(event))`. Used
233/// at `HookChain::fire` entry to compute the wall-clock ceiling on
234/// the entire chain.
235#[must_use]
236pub fn class_deadline_for_event(event: HookEvent) -> Duration {
237 class_deadline(event_class(event))
238}
239
240// ---------------------------------------------------------------------------
241// Per-hook budget derivation
242// ---------------------------------------------------------------------------
243
/// Compute the timeout budget (in milliseconds) for the next hook in
/// a chain.
///
/// * `chain_deadline`  — the absolute `Instant` at which the chain
///   as a whole runs out of room (set at `HookChain::fire` entry).
/// * `now`             — the `Instant` *just before* this hook
///   fires; the chain calls this between hooks so the budget shrinks
///   monotonically as earlier hooks consume time.
/// * `hook_timeout_ms` — the hook's own configured `timeout_ms`.
///
/// Returns `Some(budget_ms)` where `budget_ms` is the smaller of the
/// chain's remaining whole milliseconds and `hook_timeout_ms`.
///
/// Returns `None` when the chain has no usable time left: either the
/// deadline has been reached or passed, or less than one whole
/// millisecond remains. The sub-millisecond case matters because the
/// budget is expressed in whole milliseconds — truncation would
/// otherwise produce `Some(0)`, claiming the chain still has room
/// while granting the hook no time at all. The caller treats `None`
/// as a class-deadline trip (log a warning, bump the violation
/// counter, fail-open `Allow`).
///
/// A hook whose own `hook_timeout_ms` is `0` still yields `Some(0)`
/// while the chain has time left; that value is the operator's
/// configuration and is passed through unchanged.
///
/// The result is `u32`-sized to match `HookConfig.timeout_ms`. A
/// remaining duration above `u32::MAX` ms (~49 days) saturates,
/// which is harmless because class deadlines are a few seconds.
#[must_use]
pub fn per_hook_budget_ms(
    chain_deadline: Instant,
    now: Instant,
    hook_timeout_ms: u32,
) -> Option<u32> {
    // `None` here means `now` is already past the deadline.
    let remaining = chain_deadline.checked_duration_since(now)?;
    let remaining_ms = u32::try_from(remaining.as_millis()).unwrap_or(u32::MAX);
    if remaining_ms == 0 {
        // Exactly at the deadline, or under 1ms left: nothing to hand out.
        return None;
    }
    Some(remaining_ms.min(hook_timeout_ms))
}
276
277// ---------------------------------------------------------------------------
278// timeout_violations_total — process-wide counter
279// ---------------------------------------------------------------------------
280
/// Number of class-deadline trips observed by this process.
///
/// The chain runner increments it in two situations: when the
/// per-hook budget comes back as `None` (the class deadline ran out
/// before the hook could even fire), and when a hook fails with
/// `ExecutorError::Timeout` because the *shrunk* budget tripped
/// inside the executor.
///
/// Why a global atomic instead of a field on the chain:
///
/// * A `HookChain` is constructed per event and dropped when the
///   fire completes, so it offers no long-lived place to keep a
///   counter.
/// * The `ExecutorRegistry` does own a long-lived per-hook metrics
///   struct, but a timeout *violation* is a chain-level notion: the
///   executor only learns that its `timeout_ms` tripped, not
///   whether that value was the operator-configured ceiling or the
///   chain-derived floor.
/// * `AtomicU64` is lock-free and is only touched on the failure
///   branch, so contention is not measurable.
///
/// The doctor reads the value through [`timeout_violations_total`]
/// and shows it beside G3's `events_fired / events_dropped` row.
static TIMEOUT_VIOLATIONS: AtomicU64 = AtomicU64::new(0);
303
304/// Increment the process-wide violation counter. Called by the
305/// chain runner.
306pub fn record_timeout_violation() {
307 TIMEOUT_VIOLATIONS.fetch_add(1, Ordering::Relaxed);
308}
309
310/// Snapshot of the process-wide violation counter. Read by the
311/// doctor surface.
312#[must_use]
313pub fn timeout_violations_total() -> u64 {
314 TIMEOUT_VIOLATIONS.load(Ordering::Relaxed)
315}
316
/// Set the violation counter back to zero.
///
/// Exists for tests only. Production code never resets the counter:
/// the doctor depends on it being monotonic to answer "did we trip
/// a budget since boot?".
#[cfg(test)]
pub fn reset_timeout_violations_for_test() {
    let counter = &TIMEOUT_VIOLATIONS;
    counter.store(0, Ordering::Relaxed);
}
324
325// ---------------------------------------------------------------------------
326// Tests
327// ---------------------------------------------------------------------------
328
#[cfg(test)]
mod tests {
    //! Unit tests for the class-deadline table, the per-hook budget
    //! derivation, the violation counter and the Issue #1207 timing
    //! multiplier.
    //!
    //! `class_deadline` reads the process-global
    //! `AI_MEMORY_TEST_TIMING_BUDGET_MULT` variable, so every test
    //! here that observes a deadline — not only the tests that set
    //! the variable — runs inside [`with_mult`], which serialises
    //! access and pins the variable to a known value.

    use super::*;

    use std::ffi::{OsStr, OsString};
    use std::sync::{Mutex, MutexGuard, OnceLock, PoisonError};

    /// Name of the environment variable consulted by
    /// `test_timing_budget_mult`.
    const MULT_VAR: &str = "AI_MEMORY_TEST_TIMING_BUDGET_MULT";

    /// Acquire the module-wide lock guarding `MULT_VAR`. A poisoned
    /// lock (an earlier test panicked while holding it) is recovered
    /// rather than propagated: the guarded data is `()`.
    fn timing_mult_lock() -> MutexGuard<'static, ()> {
        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
        LOCK.get_or_init(|| Mutex::new(()))
            .lock()
            .unwrap_or_else(PoisonError::into_inner)
    }

    /// Set (`Some`) or remove (`None`) `MULT_VAR`.
    fn write_mult_var(value: Option<&OsStr>) {
        // SAFETY: callers hold `timing_mult_lock`, which serialises
        // every write this module makes to the variable. This assumes
        // no non-Rust code reads the process environment
        // concurrently — the same assumption the tests always made.
        match value {
            Some(v) => unsafe { std::env::set_var(MULT_VAR, v) },
            None => unsafe { std::env::remove_var(MULT_VAR) },
        }
    }

    /// RAII override of `MULT_VAR`. On drop — including during a
    /// panic unwind — the prior value is restored, and only then is
    /// the lock released (`Drop::drop` runs before fields drop).
    struct MultOverride {
        prior: Option<OsString>,
        _lock: MutexGuard<'static, ()>,
    }

    impl MultOverride {
        fn apply(value: Option<&str>) -> Self {
            let lock = timing_mult_lock();
            let prior = std::env::var_os(MULT_VAR);
            write_mult_var(value.map(OsStr::new));
            Self { prior, _lock: lock }
        }
    }

    impl Drop for MultOverride {
        fn drop(&mut self) {
            write_mult_var(self.prior.as_deref());
        }
    }

    /// Run `body` with `MULT_VAR` set to `value` (`None` = unset)
    /// while holding the module lock. The previous value is restored
    /// even if `body` panics, so one failing assertion cannot leak a
    /// multiplier into unrelated tests.
    fn with_mult<R>(value: Option<&str>, body: impl FnOnce() -> R) -> R {
        let _override = MultOverride::apply(value);
        body()
    }

    /// Every `HookEvent` variant must classify into exactly one
    /// `EventClass`. Table-driven so adding a 26th variant without
    /// updating the mapping fails this test (the compiler also
    /// flags the missing arm in `event_class`, but the assertion
    /// surface here is what an operator reading the test reads).
    #[test]
    fn event_class_table_covers_all_25_variants() {
        let table = [
            // Write — 17 variants (Task 6/8 added pre_reflect + post_reflect;
            // L1-7 added pre_compaction + on_compaction_rollback).
            (HookEvent::PreStore, EventClass::Write),
            (HookEvent::PostStore, EventClass::Write),
            (HookEvent::PreDelete, EventClass::Write),
            (HookEvent::PostDelete, EventClass::Write),
            (HookEvent::PrePromote, EventClass::Write),
            (HookEvent::PostPromote, EventClass::Write),
            (HookEvent::PreLink, EventClass::Write),
            (HookEvent::PostLink, EventClass::Write),
            (HookEvent::PreConsolidate, EventClass::Write),
            (HookEvent::PostConsolidate, EventClass::Write),
            (HookEvent::PreGovernanceDecision, EventClass::Write),
            (HookEvent::PostGovernanceDecision, EventClass::Write),
            (HookEvent::PreArchive, EventClass::Write),
            (HookEvent::PreReflect, EventClass::Write),
            (HookEvent::PostReflect, EventClass::Write),
            (HookEvent::PreCompaction, EventClass::Write),
            (HookEvent::OnCompactionRollback, EventClass::Write),
            // Read — 4 variants.
            (HookEvent::PreRecall, EventClass::Read),
            (HookEvent::PostRecall, EventClass::Read),
            (HookEvent::PreSearch, EventClass::Read),
            (HookEvent::PostSearch, EventClass::Read),
            // Index — 1 variant.
            (HookEvent::OnIndexEviction, EventClass::Index),
            // Transcript — 2 variants.
            (HookEvent::PreTranscriptStore, EventClass::Transcript),
            (HookEvent::PostTranscriptStore, EventClass::Transcript),
            // HotPath — 1 variant (G10).
            (HookEvent::PreRecallExpand, EventClass::HotPath),
        ];

        assert_eq!(
            table.len(),
            25,
            "v0.7.0 L1-7 mapping must cover exactly the 25 HookEvent variants"
        );
        for (event, expected) in table {
            assert_eq!(
                event_class(event),
                expected,
                "event {event:?} mis-classified"
            );
        }
    }

    #[test]
    fn class_deadlines_match_epic_table() {
        // Pin the multiplier to "unset" under the lock; otherwise this
        // test races the `with_mult(Some(..))` tests below.
        with_mult(None, || {
            assert_eq!(
                class_deadline(EventClass::Write),
                Duration::from_millis(5_000)
            );
            assert_eq!(
                class_deadline(EventClass::Read),
                Duration::from_millis(2_000)
            );
            assert_eq!(
                class_deadline(EventClass::Index),
                Duration::from_millis(1_000)
            );
            assert_eq!(
                class_deadline(EventClass::Transcript),
                Duration::from_millis(5_000)
            );
            // G10: hot-path budget is the v0.6.3 recall p95 (50ms).
            assert_eq!(
                class_deadline(EventClass::HotPath),
                Duration::from_millis(50)
            );
        });
    }

    #[test]
    fn class_deadline_for_event_round_trips_through_class() {
        // Same reasoning as above: deadlines are env-scaled, so read
        // them only while the multiplier is pinned.
        with_mult(None, || {
            // Spot-check one variant per class.
            assert_eq!(
                class_deadline_for_event(HookEvent::PreStore),
                Duration::from_millis(WRITE_CLASS_DEADLINE_MS)
            );
            assert_eq!(
                class_deadline_for_event(HookEvent::PostRecall),
                Duration::from_millis(READ_CLASS_DEADLINE_MS)
            );
            assert_eq!(
                class_deadline_for_event(HookEvent::OnIndexEviction),
                Duration::from_millis(INDEX_CLASS_DEADLINE_MS)
            );
            assert_eq!(
                class_deadline_for_event(HookEvent::PostTranscriptStore),
                Duration::from_millis(TRANSCRIPT_CLASS_DEADLINE_MS)
            );
            // G10: PreRecallExpand is the inhabitant of HotPath.
            assert_eq!(
                class_deadline_for_event(HookEvent::PreRecallExpand),
                Duration::from_millis(HOT_PATH_CLASS_DEADLINE_MS)
            );
        });
    }

    #[test]
    fn per_hook_budget_takes_minimum_of_chain_and_hook() {
        let now = Instant::now();
        let chain_deadline = now + Duration::from_millis(500);

        // Hook timeout is 200ms — chain has 500ms left, hook ceiling
        // wins → 200.
        let budget = per_hook_budget_ms(chain_deadline, now, 200).expect("not yet expired");
        assert_eq!(budget, 200);

        // Hook timeout is 5000ms — chain ceiling wins → 500. `now` is
        // passed in explicitly, so no wall-clock time elapses inside
        // the call; the small range only tolerates any platform
        // rounding in `Instant` arithmetic.
        let budget = per_hook_budget_ms(chain_deadline, now, 5_000).expect("not yet expired");
        assert!(
            (498..=500).contains(&budget),
            "expected ~500ms chain budget, got {budget}"
        );
    }

    #[test]
    fn per_hook_budget_returns_none_when_chain_deadline_passed() {
        let now = Instant::now();
        let chain_deadline = now - Duration::from_millis(1);
        assert!(per_hook_budget_ms(chain_deadline, now, 1_000).is_none());
    }

    #[test]
    fn per_hook_budget_at_exact_deadline_is_none() {
        let now = Instant::now();
        // `now >= chain_deadline` is the trip condition.
        assert!(per_hook_budget_ms(now, now, 1_000).is_none());
    }

    #[test]
    fn timeout_violations_counter_is_monotonic_and_resettable() {
        reset_timeout_violations_for_test();
        assert_eq!(timeout_violations_total(), 0);
        record_timeout_violation();
        record_timeout_violation();
        record_timeout_violation();
        assert_eq!(timeout_violations_total(), 3);
        reset_timeout_violations_for_test();
        assert_eq!(timeout_violations_total(), 0);
    }

    // ---------- Issue #1207 — timing-budget multiplier --------------------

    #[test]
    fn issue_1207_timing_mult_unset_defaults_to_one() {
        with_mult(None, || {
            assert_eq!(test_timing_budget_mult(), 1);
            assert_eq!(
                class_deadline(EventClass::Index),
                Duration::from_millis(INDEX_CLASS_DEADLINE_MS),
            );
        });
    }

    #[test]
    fn issue_1207_timing_mult_valid_scales_class_deadline() {
        with_mult(Some("5"), || {
            assert_eq!(test_timing_budget_mult(), 5);
            assert_eq!(
                class_deadline(EventClass::Index),
                Duration::from_millis(INDEX_CLASS_DEADLINE_MS * 5),
            );
            assert_eq!(
                class_deadline(EventClass::Write),
                Duration::from_millis(WRITE_CLASS_DEADLINE_MS * 5),
            );
        });
    }

    #[test]
    fn issue_1207_timing_mult_unparseable_falls_back_to_one() {
        with_mult(Some("bogus-not-a-number"), || {
            assert_eq!(test_timing_budget_mult(), 1);
        });
    }

    #[test]
    fn issue_1207_timing_mult_below_range_falls_back_to_one() {
        with_mult(Some("0"), || {
            assert_eq!(test_timing_budget_mult(), 1);
        });
    }

    #[test]
    fn issue_1207_timing_mult_above_range_falls_back_to_one() {
        with_mult(Some("9999"), || {
            assert_eq!(test_timing_budget_mult(), 1);
        });
    }

    #[test]
    fn issue_1207_timing_mult_boundary_at_one_and_hundred() {
        with_mult(Some("1"), || assert_eq!(test_timing_budget_mult(), 1));
        with_mult(Some("100"), || assert_eq!(test_timing_budget_mult(), 100));
    }
}