Skip to main content

ai_memory/hooks/
timeouts.rs

1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// v0.7 Track G — Task G6: per-event-class hard timeouts.
5//
6// G2 (PR #563) shipped the 20-variant `HookEvent`. G3 (PR #567)
7// shipped per-hook `timeout_ms` enforcement inside the executor. G5
8// (PR #573) shipped `HookChain::fire` which iterates hooks in
9// priority order. G6 stitches the bound on the *whole chain*: a
10// hook chain firing on a hot event (recall, search, index) cannot
11// collectively burn more wall-clock than the event class allows,
12// even if individual `timeout_ms` knobs would otherwise sum past
13// the budget.
14//
15// # Why a class deadline at all
16//
// The v0.6.3 recall path holds a 50ms p95 budget. If three hooks
// that each set `timeout_ms = 1_000` subscribe to `post_recall`, a
// single slow hook can blow the recall budget by 20×. Per-hook
20// timeouts protect the *individual* hook from a runaway script;
21// per-class timeouts protect the *operation* from the chain.
22//
// # The five classes
//
// Per V0.7-EPIC §G6 (plus the hot-path class added by G10), every
// `HookEvent` lands in exactly one of:
//
//   * Write       — pre/post_store, pre/post_delete, pre/post_promote,
//                   pre/post_link, pre/post_consolidate,
//                   pre/post_governance_decision, pre_archive,
//                   pre/post_reflect, pre_compaction,
//                   on_compaction_rollback.
//                   5000ms class deadline. Writes are user-initiated
//                   and rarer than reads, so we tolerate a longer
//                   chain (PII redaction → policy gate → audit emit
//                   chains can legitimately exceed 1s).
//   * Read        — pre/post_recall, pre/post_search.
//                   2000ms class deadline. Reads are the hot path;
//                   the budget is generous enough for a real
//                   guardrail hook (token classifier, RBAC check) but
//                   below the 5s write ceiling.
//   * Index       — on_index_eviction.
//                   1000ms class deadline. Index events fire from a
//                   maintenance background loop; a slow chain there
//                   cascades into an HNSW build stall.
//   * Transcript  — pre/post_transcript_store.
//                   5000ms class deadline. Transcripts are user-
//                   initiated like writes, but can carry MB-scale
//                   payloads where compression / classification hooks
//                   plausibly take a second or more.
//   * HotPath     — `HookEvent::PreRecallExpand` (G10).
//                   50ms class deadline. Synchronous hooks that fire
//                   inside the recall p95 budget; a hook that cannot
//                   return a decision in 50ms cannot sit on the read
//                   path without blowing the SLO.
48//
49// # How the budget is plumbed into `HookChain::fire`
50//
51// `HookChain::fire` (in `chain.rs`) computes the class deadline at
52// entry: `chain_deadline = Instant::now() + class_deadline_for(event)`.
53// Before firing each hook it derives the per-hook budget as
54// `min(chain_deadline - now, hook.timeout_ms)`. The executor
55// already enforces `timeout_ms` via `tokio::time::timeout`; G6
56// shrinks that knob on the fly when the chain itself is running out
57// of room. If the chain budget is fully consumed before the next
58// hook fires, the chain logs a warning, increments the
59// `timeout_violations` counter, and treats the remaining hooks as
60// fail-open `Allow` per G5's default `FailMode::Open` posture.
61//
62// # Doctor surface
63//
64// The chain accumulates a process-wide `timeout_violations` counter
65// (one global atomic, since the chain is built per-event and torn
66// down at end-of-fire — there's no per-chain home for state). The
67// doctor's `--hooks` block reads it via [`timeout_violations_total`]
68// and renders it alongside G3's existing `events_fired /
69// events_dropped / mean_latency_us` row.
70
71use std::sync::atomic::{AtomicU64, Ordering};
72use std::time::{Duration, Instant};
73
74use super::events::HookEvent;
75
76// ---------------------------------------------------------------------------
// EventClass — the five budget buckets
78// ---------------------------------------------------------------------------
79
/// Coarse classification of a [`HookEvent`] for per-class deadline
/// enforcement. Every class maps to one hardcoded whole-chain budget
/// (see [`class_deadline`]); every event maps to exactly one class
/// (see [`event_class`]).
///
/// `Copy + Hash` so it can be a `HashMap` key in downstream code
/// (today the deadline table is a `match`, not a map; the derive
/// cost is zero and keeps options open for the doctor).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EventClass {
    /// State-mutating events: store / delete / promote / link /
    /// consolidate / governance / archive, plus the reflect and
    /// compaction lifecycle events. Budget:
    /// [`WRITE_CLASS_DEADLINE_MS`].
    Write,
    /// Query events: recall / search. Tighter than the write budget,
    /// but far looser than [`EventClass::HotPath`], which covers the
    /// hooks that run *inside* the recall p95 budget. Budget:
    /// [`READ_CLASS_DEADLINE_MS`].
    Read,
    /// HNSW index lifecycle events. Background maintenance loop.
    /// Budget: [`INDEX_CLASS_DEADLINE_MS`].
    Index,
    /// Transcript I-track events. Same 5s budget as writes; called
    /// out separately because the payload shape and call-site
    /// pressure profile differ. Budget:
    /// [`TRANSCRIPT_CLASS_DEADLINE_MS`].
    Transcript,
    /// G10: synchronous hot-path hooks that fire *inside* the recall
    /// p95 budget (50ms). Today's only inhabitant is
    /// [`HookEvent::PreRecallExpand`]; future synchronous hot-path
    /// hooks (e.g. a `pre_search_expand`) would join this class. The
    /// 50ms ceiling matches the v0.6.3 recall p95 budget — a hook
    /// that can't return a decision in 50ms cannot be wired on the
    /// read path without blowing SLO. Budget:
    /// [`HOT_PATH_CLASS_DEADLINE_MS`].
    HotPath,
}
109
110// ---------------------------------------------------------------------------
111// Class deadlines — hardcoded per V0.7-EPIC §G6
112// ---------------------------------------------------------------------------
113
/// Whole-chain deadline, in milliseconds, for [`EventClass::Write`]
/// (state-mutating events).
pub const WRITE_CLASS_DEADLINE_MS: u64 = 5_000;
/// Whole-chain deadline, in milliseconds, for [`EventClass::Read`]
/// (recall / search events).
pub const READ_CLASS_DEADLINE_MS: u64 = 2_000;
/// Whole-chain deadline, in milliseconds, for [`EventClass::Index`]
/// (HNSW index lifecycle events).
pub const INDEX_CLASS_DEADLINE_MS: u64 = 1_000;
/// Whole-chain deadline, in milliseconds, for
/// [`EventClass::Transcript`]. Deliberately equal to the write
/// deadline; kept as its own constant so the two can diverge.
pub const TRANSCRIPT_CLASS_DEADLINE_MS: u64 = 5_000;
/// G10 — whole-chain deadline, in milliseconds, for
/// [`EventClass::HotPath`] (synchronous recall-budget hooks).
/// 50ms = the v0.6.3 recall p95 budget; a hook that runs longer
/// would blow the SLO. The class deadline is the *whole-chain*
/// ceiling — individual hook `timeout_ms` may be configured smaller.
pub const HOT_PATH_CLASS_DEADLINE_MS: u64 = 50;
128
129// ---------------------------------------------------------------------------
130// event_class — the canonical mapping
131// ---------------------------------------------------------------------------
132
133/// Map a [`HookEvent`] to its [`EventClass`]. Total over the 25
134/// variants — the compiler's exhaustiveness check enforces the table
135/// stays in sync if a 26th event ever lands.
136#[must_use]
137pub fn event_class(event: HookEvent) -> EventClass {
138    match event {
139        // Writes: state-mutating memory operations.
140        HookEvent::PreStore
141        | HookEvent::PostStore
142        | HookEvent::PreDelete
143        | HookEvent::PostDelete
144        | HookEvent::PrePromote
145        | HookEvent::PostPromote
146        | HookEvent::PreLink
147        | HookEvent::PostLink
148        | HookEvent::PreConsolidate
149        | HookEvent::PostConsolidate
150        | HookEvent::PreGovernanceDecision
151        | HookEvent::PostGovernanceDecision
152        | HookEvent::PreArchive
153        // v0.7.0 Task 6/8: reflect lifecycle fires on the write
154        // path (the substrate inserts the new reflection memory +
155        // N reflects_on links inside a single transaction).
156        | HookEvent::PreReflect
157        | HookEvent::PostReflect
158        // v0.7.0 L1-7: compaction pipeline events are write-class
159        // (the pass may delete source rows and insert a summary).
160        | HookEvent::PreCompaction
161        | HookEvent::OnCompactionRollback => EventClass::Write,
162        // Reads: query path. Hot.
163        HookEvent::PreRecall
164        | HookEvent::PostRecall
165        | HookEvent::PreSearch
166        | HookEvent::PostSearch => EventClass::Read,
167        // Index: HNSW lifecycle.
168        HookEvent::OnIndexEviction => EventClass::Index,
169        // Transcripts: I-track interop.
170        HookEvent::PreTranscriptStore | HookEvent::PostTranscriptStore => EventClass::Transcript,
171        // G10: synchronous hot-path query expansion (50ms budget).
172        HookEvent::PreRecallExpand => EventClass::HotPath,
173    }
174}
175
176/// The hardcoded class deadline (as a [`Duration`]) for `class`.
177/// The `match` mirrors [`event_class`] inverse-style; a single
178/// branch means the compiler inlines this to a constant load at
179/// every call site.
180///
181/// **Issue #1207 — macOS timing-budget multiplier.** When running on
182/// macOS under parallel `cargo test` load, `fork+exec` of even a tiny
183/// shell script regularly takes >1s on a stressed dev host (Apple
184/// Silicon m1/m2/m3 alike). The 1000ms `Index` class deadline races
185/// the spawn budget and the test surfaces as a timeout — independent
186/// of the EAGAIN/ENOMEM/EMFILE spawn-errno class the rest of #1207
187/// addresses. The `AI_MEMORY_TEST_TIMING_BUDGET_MULT` env var is a
188/// test-only multiplier (default `1`) that scales every class deadline
189/// at runtime so tests can opt into a wider budget without changing
190/// production behaviour. The factory test runner sets this to `3` on
191/// macOS via the `tests/hooks_executor_test.rs` setup; production
192/// daemons inherit the unset default.
193///
194/// Compiled out of release binaries entirely via `cfg(any(test,
195/// debug_assertions))`. Production runs see the hardcoded constants
196/// at zero overhead — the env-var read fires only for `cargo test`.
197#[must_use]
198pub fn class_deadline(class: EventClass) -> Duration {
199    let base_ms = match class {
200        EventClass::Write => WRITE_CLASS_DEADLINE_MS,
201        EventClass::Read => READ_CLASS_DEADLINE_MS,
202        EventClass::Index => INDEX_CLASS_DEADLINE_MS,
203        EventClass::Transcript => TRANSCRIPT_CLASS_DEADLINE_MS,
204        EventClass::HotPath => HOT_PATH_CLASS_DEADLINE_MS,
205    };
206    Duration::from_millis(base_ms.saturating_mul(test_timing_budget_mult()))
207}
208
/// Test-only timing budget multiplier (test and debug builds).
///
/// Looks up `AI_MEMORY_TEST_TIMING_BUDGET_MULT` in the process
/// environment on every call — nothing is cached, so a test can set
/// the variable just before the code under test runs. The value is
/// honoured only when it parses as an integer in `1..=100`; an unset,
/// non-Unicode, unparseable or out-of-range value yields `1`, i.e.
/// the unscaled production deadlines.
#[cfg(any(test, debug_assertions))]
fn test_timing_budget_mult() -> u64 {
    let parsed = std::env::var("AI_MEMORY_TEST_TIMING_BUDGET_MULT").map(|raw| raw.parse::<u64>());
    match parsed {
        Ok(Ok(mult @ 1..=100)) => mult,
        _ => 1,
    }
}

/// Builds without `test` / `debug_assertions`: the multiplier is the
/// constant `1` and the environment is never consulted, so the
/// `saturating_mul` in `class_deadline` leaves the deadline as-is.
#[cfg(not(any(test, debug_assertions)))]
#[inline(always)]
fn test_timing_budget_mult() -> u64 {
    1
}
231
232/// Convenience wrapper: `class_deadline(event_class(event))`. Used
233/// at `HookChain::fire` entry to compute the wall-clock ceiling on
234/// the entire chain.
235#[must_use]
236pub fn class_deadline_for_event(event: HookEvent) -> Duration {
237    class_deadline(event_class(event))
238}
239
240// ---------------------------------------------------------------------------
241// Per-hook budget derivation
242// ---------------------------------------------------------------------------
243
/// Compute the per-hook timeout budget (in milliseconds) given:
///
///   * `chain_deadline` — the absolute `Instant` at which the chain
///     itself runs out of room (set at `HookChain::fire` entry).
///   * `now`            — the `Instant` *just before* this hook fires;
///     the chain calls this between hooks so the per-hook budget
///     shrinks monotonically as earlier hooks consume time.
///   * `hook_timeout_ms` — the hook's own configured `timeout_ms`.
///
/// Returns `Some(budget_ms)` when the chain still has at least one
/// whole millisecond left, and `None` otherwise (caller treats that
/// as a class-deadline trip — log warning, increment violation
/// counter, fail-open `Allow`).
///
/// A remainder below 1ms counts as expired: the budget is expressed
/// in whole milliseconds, so such a remainder would otherwise
/// truncate to `Some(0)` and the chain would launch a hook with a
/// zero timeout that is bound to trip. A hook whose *own*
/// `hook_timeout_ms` is `0` still gets `Some(0)` — that is the
/// operator's configuration, not a class-deadline trip.
///
/// The result is the smaller of the two budgets — the time left
/// until the chain deadline and the hook's own ceiling. `u32`-sized
/// to match `HookConfig.timeout_ms`; a remainder beyond
/// `u32::MAX` ms (~49d) saturates, which is fine because the class
/// deadlines are in the low seconds.
#[must_use]
pub fn per_hook_budget_ms(
    chain_deadline: Instant,
    now: Instant,
    hook_timeout_ms: u32,
) -> Option<u32> {
    // `None` here means `now` is already past the deadline.
    let remaining = chain_deadline.checked_duration_since(now)?;
    let remaining_ms = u32::try_from(remaining.as_millis()).unwrap_or(u32::MAX);
    if remaining_ms == 0 {
        // Exactly at the deadline, or less than one whole millisecond
        // short of it: nothing usable is left for another hook.
        return None;
    }
    Some(remaining_ms.min(hook_timeout_ms))
}
276
277// ---------------------------------------------------------------------------
278// timeout_violations_total — process-wide counter
279// ---------------------------------------------------------------------------
280
/// Process-wide tally of class-deadline trips.
///
/// The chain runner bumps it in two situations: when a hook's
/// per-hook budget came back as `None` (the class deadline expired
/// before the hook even got to fire), and when a hook returned an
/// `ExecutorError::Timeout` because the *shrunk* budget tripped
/// inside the executor.
///
/// Why a global atomic instead of a per-chain field:
///
///   * `HookChain` is built per-event and discarded at end-of-fire,
///     so the chain has no long-lived place to keep a counter.
///   * The `ExecutorRegistry` does own long-lived per-hook metrics,
///     but a timeout *violation* is a chain-level concept: the
///     executor only knows its `timeout_ms` tripped, not whether
///     that value was the operator-configured ceiling or the
///     chain-derived floor.
///   * `AtomicU64` is lock-free and only touched on the failure
///     branch, so contention is not a concern.
///
/// All accesses use `Ordering::Relaxed`: the value is a standalone
/// statistic and is not used to synchronise other memory.
static TIMEOUT_VIOLATIONS: AtomicU64 = AtomicU64::new(0);

/// Record one class-deadline trip. Called by the chain runner.
pub fn record_timeout_violation() {
    let _previous = TIMEOUT_VIOLATIONS.fetch_add(1, Ordering::Relaxed);
}

/// Current value of the process-wide violation counter. The doctor
/// surface reads this and renders it next to G3's
/// `events_fired / events_dropped` row.
#[must_use]
pub fn timeout_violations_total() -> u64 {
    let snapshot = TIMEOUT_VIOLATIONS.load(Ordering::Relaxed);
    snapshot
}

/// Zero the violation counter. Test-only: production never resets,
/// because the doctor relies on a monotonic count to answer "did we
/// trip a budget since boot?".
#[cfg(test)]
pub fn reset_timeout_violations_for_test() {
    TIMEOUT_VIOLATIONS.store(0, Ordering::Relaxed);
}
324
325// ---------------------------------------------------------------------------
326// Tests
327// ---------------------------------------------------------------------------
328
#[cfg(test)]
mod tests {
    use super::*;

    /// Env var consulted by `test_timing_budget_mult`.
    const MULT_ENV: &str = "AI_MEMORY_TEST_TIMING_BUDGET_MULT";

    // ---------- Shared env-var harness (Issue #1207) ----------------------

    // The multiplier env var is process-global, and `class_deadline`
    // reads it on every call. Every test that either sets the
    // variable or asserts on a `class_deadline` result must therefore
    // run under this lock, otherwise parallel cargo-test threads race
    // (one test sets the multiplier to 5 while another asserts the
    // unscaled deadline).
    fn timing_mult_lock() -> std::sync::MutexGuard<'static, ()> {
        use std::sync::{Mutex, OnceLock};
        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
        LOCK.get_or_init(|| Mutex::new(()))
            .lock()
            // A failed assertion in another test poisons the mutex;
            // the guarded data is `()`, so recovering is always sound.
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Puts the multiplier env var back to its prior state on drop,
    /// so a panicking test body cannot leak its value into later
    /// tests.
    struct RestoreMultEnv(Option<std::ffi::OsString>);

    impl Drop for RestoreMultEnv {
        fn drop(&mut self) {
            // SAFETY: only reached from `with_mult`, which still holds
            // `timing_mult_lock` at this point (the lock guard is
            // declared first and therefore dropped last). Assumes no
            // other thread in the test binary touches the environment
            // outside `std::env`.
            match self.0.take() {
                Some(v) => unsafe { std::env::set_var(MULT_ENV, v) },
                None => unsafe { std::env::remove_var(MULT_ENV) },
            }
        }
    }

    /// Run `body` with the multiplier env var set to `value` (`None`
    /// = unset), serialized against every other `with_mult` caller.
    /// The previous value is restored afterwards, including when
    /// `body` panics.
    fn with_mult<R>(value: Option<&str>, body: impl FnOnce() -> R) -> R {
        let _guard = timing_mult_lock();
        let _restore = RestoreMultEnv(std::env::var_os(MULT_ENV));
        // SAFETY: env mutation is serialized by `timing_mult_lock`,
        // held for the whole call. Assumes no other thread in the
        // test binary touches the environment outside `std::env`.
        match value {
            Some(v) => unsafe { std::env::set_var(MULT_ENV, v) },
            None => unsafe { std::env::remove_var(MULT_ENV) },
        }
        body()
    }

    // ---------- Classification and deadline tables ------------------------

    /// Every `HookEvent` variant must classify into exactly one
    /// `EventClass`. Table-driven so adding a 26th variant without
    /// updating the mapping fails this test (the compiler also
    /// flags the missing arm in `event_class`, but the assertion
    /// surface here is what an operator reading the test reads).
    #[test]
    fn event_class_table_covers_all_25_variants() {
        let table = [
            // Write — 17 variants (Task 6/8 added pre_reflect + post_reflect;
            // L1-7 added pre_compaction + on_compaction_rollback).
            (HookEvent::PreStore, EventClass::Write),
            (HookEvent::PostStore, EventClass::Write),
            (HookEvent::PreDelete, EventClass::Write),
            (HookEvent::PostDelete, EventClass::Write),
            (HookEvent::PrePromote, EventClass::Write),
            (HookEvent::PostPromote, EventClass::Write),
            (HookEvent::PreLink, EventClass::Write),
            (HookEvent::PostLink, EventClass::Write),
            (HookEvent::PreConsolidate, EventClass::Write),
            (HookEvent::PostConsolidate, EventClass::Write),
            (HookEvent::PreGovernanceDecision, EventClass::Write),
            (HookEvent::PostGovernanceDecision, EventClass::Write),
            (HookEvent::PreArchive, EventClass::Write),
            (HookEvent::PreReflect, EventClass::Write),
            (HookEvent::PostReflect, EventClass::Write),
            (HookEvent::PreCompaction, EventClass::Write),
            (HookEvent::OnCompactionRollback, EventClass::Write),
            // Read — 4 variants.
            (HookEvent::PreRecall, EventClass::Read),
            (HookEvent::PostRecall, EventClass::Read),
            (HookEvent::PreSearch, EventClass::Read),
            (HookEvent::PostSearch, EventClass::Read),
            // Index — 1 variant.
            (HookEvent::OnIndexEviction, EventClass::Index),
            // Transcript — 2 variants.
            (HookEvent::PreTranscriptStore, EventClass::Transcript),
            (HookEvent::PostTranscriptStore, EventClass::Transcript),
            // HotPath — 1 variant (G10).
            (HookEvent::PreRecallExpand, EventClass::HotPath),
        ];

        assert_eq!(
            table.len(),
            25,
            "v0.7.0 L1-7 mapping must cover exactly the 25 HookEvent variants"
        );
        for (event, expected) in table {
            assert_eq!(
                event_class(event),
                expected,
                "event {event:?} mis-classified"
            );
        }
    }

    #[test]
    fn class_deadlines_match_epic_table() {
        // Pin the multiplier to "unset": `class_deadline` reads the
        // env var, so without the lock this races the issue_1207
        // tests and breaks when the variable is set in the ambient
        // environment.
        with_mult(None, || {
            assert_eq!(
                class_deadline(EventClass::Write),
                Duration::from_millis(5_000)
            );
            assert_eq!(
                class_deadline(EventClass::Read),
                Duration::from_millis(2_000)
            );
            assert_eq!(
                class_deadline(EventClass::Index),
                Duration::from_millis(1_000)
            );
            assert_eq!(
                class_deadline(EventClass::Transcript),
                Duration::from_millis(5_000)
            );
            // G10: hot-path budget is the v0.6.3 recall p95 (50ms).
            assert_eq!(
                class_deadline(EventClass::HotPath),
                Duration::from_millis(50)
            );
        });
    }

    #[test]
    fn class_deadline_for_event_round_trips_through_class() {
        // Same env-var pinning as `class_deadlines_match_epic_table`.
        with_mult(None, || {
            // Spot-check one variant per class.
            assert_eq!(
                class_deadline_for_event(HookEvent::PreStore),
                Duration::from_millis(WRITE_CLASS_DEADLINE_MS)
            );
            assert_eq!(
                class_deadline_for_event(HookEvent::PostRecall),
                Duration::from_millis(READ_CLASS_DEADLINE_MS)
            );
            assert_eq!(
                class_deadline_for_event(HookEvent::OnIndexEviction),
                Duration::from_millis(INDEX_CLASS_DEADLINE_MS)
            );
            assert_eq!(
                class_deadline_for_event(HookEvent::PostTranscriptStore),
                Duration::from_millis(TRANSCRIPT_CLASS_DEADLINE_MS)
            );
            // G10: PreRecallExpand is the inhabitant of HotPath.
            assert_eq!(
                class_deadline_for_event(HookEvent::PreRecallExpand),
                Duration::from_millis(HOT_PATH_CLASS_DEADLINE_MS)
            );
        });
    }

    // ---------- Per-hook budget -------------------------------------------

    #[test]
    fn per_hook_budget_takes_minimum_of_chain_and_hook() {
        let now = Instant::now();
        let chain_deadline = now + Duration::from_millis(500);

        // Hook timeout is 200ms — chain has 500ms left, hook ceiling
        // wins → 200.
        let budget = per_hook_budget_ms(chain_deadline, now, 200).expect("not yet expired");
        assert_eq!(budget, 200);

        // Hook timeout is 5000ms — chain ceiling wins → 500.
        // `per_hook_budget_ms` takes `now` as an argument and never
        // samples the clock itself, so the remainder is 500ms; the
        // small range only tolerates rounding in the platform's
        // `Instant` arithmetic.
        let budget = per_hook_budget_ms(chain_deadline, now, 5_000).expect("not yet expired");
        assert!(
            (498..=500).contains(&budget),
            "expected ~500ms chain budget, got {budget}"
        );
    }

    #[test]
    fn per_hook_budget_returns_none_when_chain_deadline_passed() {
        let now = Instant::now();
        let chain_deadline = now - Duration::from_millis(1);
        assert!(per_hook_budget_ms(chain_deadline, now, 1_000).is_none());
    }

    #[test]
    fn per_hook_budget_at_exact_deadline_is_none() {
        let now = Instant::now();
        // `now >= chain_deadline` is the trip condition.
        assert!(per_hook_budget_ms(now, now, 1_000).is_none());
    }

    // ---------- Violation counter -----------------------------------------

    // NOTE(review): the counter is process-global. If tests in other
    // modules of this crate bump it concurrently, the exact-value
    // assertions below can flake — confirm nothing else in the test
    // binary calls `record_timeout_violation` in parallel.
    #[test]
    fn timeout_violations_counter_is_monotonic_and_resettable() {
        reset_timeout_violations_for_test();
        assert_eq!(timeout_violations_total(), 0);
        record_timeout_violation();
        record_timeout_violation();
        record_timeout_violation();
        assert_eq!(timeout_violations_total(), 3);
        reset_timeout_violations_for_test();
        assert_eq!(timeout_violations_total(), 0);
    }

    // ---------- Issue #1207 — timing-budget multiplier --------------------

    #[test]
    fn issue_1207_timing_mult_unset_defaults_to_one() {
        with_mult(None, || {
            assert_eq!(test_timing_budget_mult(), 1);
            assert_eq!(
                class_deadline(EventClass::Index),
                Duration::from_millis(INDEX_CLASS_DEADLINE_MS),
            );
        });
    }

    #[test]
    fn issue_1207_timing_mult_valid_scales_class_deadline() {
        with_mult(Some("5"), || {
            assert_eq!(test_timing_budget_mult(), 5);
            assert_eq!(
                class_deadline(EventClass::Index),
                Duration::from_millis(INDEX_CLASS_DEADLINE_MS * 5),
            );
            assert_eq!(
                class_deadline(EventClass::Write),
                Duration::from_millis(WRITE_CLASS_DEADLINE_MS * 5),
            );
        });
    }

    #[test]
    fn issue_1207_timing_mult_unparseable_falls_back_to_one() {
        with_mult(Some("bogus-not-a-number"), || {
            assert_eq!(test_timing_budget_mult(), 1);
        });
    }

    #[test]
    fn issue_1207_timing_mult_below_range_falls_back_to_one() {
        with_mult(Some("0"), || {
            assert_eq!(test_timing_budget_mult(), 1);
        });
    }

    #[test]
    fn issue_1207_timing_mult_above_range_falls_back_to_one() {
        with_mult(Some("9999"), || {
            assert_eq!(test_timing_budget_mult(), 1);
        });
    }

    #[test]
    fn issue_1207_timing_mult_boundary_at_one_and_hundred() {
        with_mult(Some("1"), || assert_eq!(test_timing_budget_mult(), 1));
        with_mult(Some("100"), || assert_eq!(test_timing_budget_mult(), 100));
    }
}