agent-doc 0.33.0

Interactive document sessions with AI agents
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
//! # Module: supervisor::state
//!
//! Crash classifier, restart history ring buffer, and supervisor state machine.
//!
//! ## Spec
//! See `src/agent-doc/specs/supervisor.md` § Crash Recovery Policy.
//!
//! The supervisor tracks every child exit in a bounded ring buffer and uses the
//! recent-exit frequency to classify each exit as Clean, Transient, or Flapping.
//! The classification drives a state machine (Healthy → Degraded → Halted) that
//! determines whether and how the child should be restarted.
//!
//! ## State Machine
//!
//! ```text
//! on exit code c:
//!   append to ring buffer (last 10 exits with timestamps); counts below include this exit
//!   classify:
//!     c == 0                          → Clean
//!     c != 0 AND exits_in_60s < 3    → Transient
//!     c != 0 AND exits_in_60s >= 3   → Flapping
//!   transition:
//!     Clean     → Healthy, reset consecutive counter
//!     Transient → Healthy
//!     Flapping  → Degraded, increment consecutive flap counter
//!                  5th consecutive → Halted
//! ```
//!
//! ## Invariants
//!
//! - Ring buffer is capped at 10 entries. Older entries are evicted on push.
//! - `exits_in_window` only counts non-zero exit codes within the time window.
//! - Once Halted, the policy stays Halted until externally reset (not modeled
//!   here — the `supervisor resume` CLI command calls `CrashPolicy::reset`).
//! - `RestartAction` is a value type — the caller (future `start.rs` wire-up)
//!   matches on it and performs the actual sleep/prompt/halt.

use std::collections::VecDeque;
use std::time::{Duration, Instant};

/// Maximum entries in the restart history ring buffer.
///
/// Pushing onto a full buffer evicts the oldest entry first.
const RING_BUFFER_CAP: usize = 10;

/// Window for flap detection: non-zero exits no older than this duration
/// count toward the flapping threshold.
const FLAP_WINDOW: Duration = Duration::from_secs(60);

/// Number of non-zero exits within [`FLAP_WINDOW`] that triggers Flapping.
///
/// The exit being classified is recorded before the count is taken, so the
/// third non-zero exit inside the window is the first one classified Flapping.
const FLAP_THRESHOLD: usize = 3;

/// Number of consecutive Flapping classifications before Halted.
const HALT_THRESHOLD: usize = 5;

/// Delay before restarting after a Transient exit.
const TRANSIENT_DELAY: Duration = Duration::from_secs(2);

/// Delay before restarting after a Flapping exit (while not yet Halted).
const FLAPPING_DELAY: Duration = Duration::from_secs(30);

/// Verdict the crash classifier assigns to one child exit.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ExitClass {
    /// The child returned exit code 0.
    Clean,
    /// The child returned a non-zero code while the number of recent
    /// non-zero exits was still under the flapping threshold.
    Transient,
    /// The child returned a non-zero code and at least FLAP_THRESHOLD
    /// non-zero exits fell inside the last 60s.
    Flapping,
}

/// Health of the supervisor, as reported through the IPC `state` method.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SupervisorState {
    /// Everything is normal; restarts happen immediately or after a short
    /// delay.
    Healthy,
    /// The child is flapping; restarts are spaced out with longer delays.
    Degraded,
    /// Too many consecutive failures were seen. No restart happens until the
    /// supervisor is resumed from outside.
    Halted,
}

impl SupervisorState {
    /// Returns the fixed lowercase tag used for this state in IPC responses
    /// and log output.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Healthy => "healthy",
            Self::Degraded => "degraded",
            Self::Halted => "halted",
        }
    }
}

/// Instruction handed back to the caller after a child exit has been
/// processed. This is a plain value; the caller carries out the action.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RestartAction {
    /// Ask the user what to do (Enter to restart fresh, 'q' to quit).
    PromptUser,
    /// Sleep for `delay` and then restart, passing the given flags along.
    RestartAfter {
        /// How long to wait before restarting.
        delay: Duration,
        /// Restart flag forwarded to the caller.
        // NOTE(review): presumably selects a "continue previous session"
        // restart rather than a fresh one — confirm against the caller.
        with_continue: bool,
    },
    /// The supervisor is halted; the child must not be restarted.
    Halt,
}

/// One observed child exit, as stored in the restart history ring buffer.
#[derive(Clone, Debug)]
pub struct ExitRecord {
    /// Exit code reported by the child.
    pub code: i32,
    /// Instant at which the exit was observed.
    pub timestamp: Instant,
}

/// Bounded ring buffer of recent child exits.
///
/// Holds at most [`RING_BUFFER_CAP`] records in insertion order (front is the
/// oldest). Pushing onto a full buffer evicts the oldest record.
#[derive(Debug)]
pub struct RestartHistory {
    /// Recorded exits, oldest first.
    entries: VecDeque<ExitRecord>,
}

impl Default for RestartHistory {
    /// Equivalent to [`RestartHistory::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl RestartHistory {
    /// Create an empty history with room for [`RING_BUFFER_CAP`] records.
    pub fn new() -> Self {
        Self {
            entries: VecDeque::with_capacity(RING_BUFFER_CAP),
        }
    }

    /// Record an exit. Evicts the oldest entry if at capacity.
    pub fn push(&mut self, record: ExitRecord) {
        if self.entries.len() >= RING_BUFFER_CAP {
            self.entries.pop_front();
        }
        self.entries.push_back(record);
    }

    /// Count non-zero exits within `window` of `now`.
    ///
    /// A record counts when its exit code is non-zero and its timestamp is at
    /// or after `now - window`. Clean (code 0) exits are never counted.
    ///
    /// If `now - window` cannot be represented as an `Instant`
    /// (`Instant::checked_sub` returns `None`), no recorded timestamp can be
    /// older than the window, so every non-zero exit is counted.
    pub fn exits_in_window(&self, window: Duration, now: Instant) -> usize {
        // `None` means the window reaches back past the earliest
        // representable instant, i.e. it covers the whole history.
        let cutoff = now.checked_sub(window);
        self.entries
            .iter()
            .filter(|r| r.code != 0)
            .filter(|r| match cutoff {
                Some(earliest) => r.timestamp >= earliest,
                None => true,
            })
            .count()
    }

    /// Number of entries in the buffer.
    #[allow(dead_code)] // API surface — used by tests
    pub fn len(&self) -> usize {
        self.entries.len()
    }
}

/// Crash policy: classifies exits, tracks state, and recommends restart actions.
///
/// Feed every child exit to [`on_exit`](Self::on_exit); the returned
/// [`RestartAction`] is what the caller should do next. Once the policy
/// reaches [`SupervisorState::Halted`] it stays there, whatever later exits
/// are reported, until [`reset`](Self::reset) is called.
#[derive(Debug)]
pub struct CrashPolicy {
    /// Current supervisor health.
    pub state: SupervisorState,
    /// Recent exits, used for flap detection.
    pub history: RestartHistory,
    /// Number of consecutive Flapping classifications without an intervening
    /// Clean exit.
    consecutive_flaps: usize,
}

impl Default for CrashPolicy {
    /// Equivalent to [`CrashPolicy::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl CrashPolicy {
    /// Create a policy in the Healthy state with an empty history.
    pub fn new() -> Self {
        Self {
            state: SupervisorState::Healthy,
            history: RestartHistory::new(),
            consecutive_flaps: 0,
        }
    }

    /// Classify an exit, record it, transition state, and return the
    /// recommended action.
    ///
    /// This is the main entry point called by the supervisor restart loop.
    pub fn on_exit(&mut self, exit_code: i32) -> RestartAction {
        self.on_exit_at(exit_code, Instant::now())
    }

    /// Like [`on_exit`](Self::on_exit) but with an explicit timestamp for
    /// deterministic testing.
    pub fn on_exit_at(&mut self, exit_code: i32, now: Instant) -> RestartAction {
        // Record first: the classification below counts this exit too.
        let record = ExitRecord {
            code: exit_code,
            timestamp: now,
        };
        self.history.push(record);

        let class = self.classify(exit_code, now);
        self.transition(class);
        self.action(class)
    }

    /// Classify the exit based on code and recent history.
    ///
    /// Code 0 is always Clean. A non-zero code is Flapping when at least
    /// `FLAP_THRESHOLD` non-zero exits lie within `FLAP_WINDOW` of `now`,
    /// otherwise Transient.
    fn classify(&self, exit_code: i32, now: Instant) -> ExitClass {
        if exit_code == 0 {
            return ExitClass::Clean;
        }
        if self.history.exits_in_window(FLAP_WINDOW, now) >= FLAP_THRESHOLD {
            ExitClass::Flapping
        } else {
            ExitClass::Transient
        }
    }

    /// Update supervisor state based on the exit classification.
    ///
    /// Halted is sticky: no classification, not even Clean, leaves it. Only
    /// [`reset`](Self::reset) does.
    fn transition(&mut self, class: ExitClass) {
        if self.state == SupervisorState::Halted {
            // Documented invariant: stay Halted until externally reset.
            return;
        }
        match class {
            ExitClass::Clean => {
                self.state = SupervisorState::Healthy;
                self.consecutive_flaps = 0;
            }
            ExitClass::Transient => {
                // Transient does not escalate. A transient (non-flapping)
                // exit while Degraded means the flap storm has subsided.
                if self.state == SupervisorState::Degraded {
                    self.state = SupervisorState::Healthy;
                    self.consecutive_flaps = 0;
                }
            }
            ExitClass::Flapping => {
                self.consecutive_flaps += 1;
                self.state = if self.consecutive_flaps >= HALT_THRESHOLD {
                    SupervisorState::Halted
                } else {
                    SupervisorState::Degraded
                };
            }
        }
    }

    /// Map the classification to a concrete restart action.
    ///
    /// A Halted policy always yields [`RestartAction::Halt`], regardless of
    /// the classification.
    fn action(&self, class: ExitClass) -> RestartAction {
        if self.state == SupervisorState::Halted {
            return RestartAction::Halt;
        }
        match class {
            ExitClass::Clean => RestartAction::PromptUser,
            ExitClass::Transient => RestartAction::RestartAfter {
                delay: TRANSIENT_DELAY,
                with_continue: true,
            },
            ExitClass::Flapping => RestartAction::RestartAfter {
                delay: FLAPPING_DELAY,
                with_continue: true,
            },
        }
    }

    /// Reset the policy to Healthy with empty history. Called by
    /// `supervisor resume` to un-halt.
    #[allow(dead_code)] // API surface — consumed by IPC resume handler (future)
    pub fn reset(&mut self) {
        self.state = SupervisorState::Healthy;
        self.history = RestartHistory::new();
        self.consecutive_flaps = 0;
    }

    /// Current number of consecutive flapping exits.
    #[allow(dead_code)] // API surface — used by tests and IPC state response (future)
    pub fn consecutive_flaps(&self) -> usize {
        self.consecutive_flaps
    }
}

// Unit tests for the crash classifier, the restart history ring buffer and
// the supervisor state machine. All timing is driven through `on_exit_at`
// with synthetic instants, so no test sleeps.
#[cfg(test)]
mod tests {
    use super::*;

    /// Return the `Instant` that lies `secs` seconds after `base`.
    fn offset(base: Instant, secs: u64) -> Instant {
        base + Duration::from_secs(secs)
    }

    // Exit code 0 prompts the user and leaves the policy Healthy.
    #[test]
    fn classify_clean_exit() {
        let mut policy = CrashPolicy::new();
        let action = policy.on_exit_at(0, Instant::now());
        assert_eq!(action, RestartAction::PromptUser);
        assert_eq!(policy.state, SupervisorState::Healthy);
    }

    // A lone non-zero exit is Transient: short delay, state stays Healthy.
    #[test]
    fn classify_single_nonzero_is_transient() {
        let mut policy = CrashPolicy::new();
        let action = policy.on_exit_at(1, Instant::now());
        assert_eq!(
            action,
            RestartAction::RestartAfter {
                delay: TRANSIENT_DELAY,
                with_continue: true,
            }
        );
        assert_eq!(policy.state, SupervisorState::Healthy);
    }

    // The third non-zero exit inside the 60s window is classified Flapping.
    #[test]
    fn classify_three_in_60s_is_flapping() {
        let mut policy = CrashPolicy::new();
        let base = Instant::now();
        policy.on_exit_at(1, offset(base, 0));
        policy.on_exit_at(1, offset(base, 10));
        let action = policy.on_exit_at(1, offset(base, 20));
        assert_eq!(
            action,
            RestartAction::RestartAfter {
                delay: FLAPPING_DELAY,
                with_continue: true,
            }
        );
        assert_eq!(policy.state, SupervisorState::Degraded);
    }

    // Pushing more than RING_BUFFER_CAP records keeps only the newest ones.
    #[test]
    fn ring_buffer_caps_at_10() {
        let mut history = RestartHistory::new();
        let base = Instant::now();
        for i in 0..15 {
            history.push(ExitRecord {
                code: 1,
                timestamp: offset(base, i),
            });
        }
        assert_eq!(history.len(), RING_BUFFER_CAP);
    }

    // Records older than the window are ignored by `exits_in_window`.
    #[test]
    fn exits_in_window_excludes_old() {
        let mut history = RestartHistory::new();
        let base = Instant::now();
        // Old exit — 120s before `now` (defined below), outside the window.
        history.push(ExitRecord {
            code: 1,
            timestamp: base,
        });
        // Recent exit — 10s before `now`, inside the window.
        let now = offset(base, 120);
        history.push(ExitRecord {
            code: 1,
            timestamp: offset(base, 110),
        });
        assert_eq!(
            history.exits_in_window(FLAP_WINDOW, now),
            1,
            "only the recent exit should count"
        );
    }

    // The first Flapping classification moves the policy to Degraded and
    // starts the consecutive-flap counter at 1.
    #[test]
    fn flapping_transitions_to_degraded() {
        let mut policy = CrashPolicy::new();
        let base = Instant::now();
        // Push 3 exits within 60s
        policy.on_exit_at(1, offset(base, 0));
        policy.on_exit_at(1, offset(base, 1));
        policy.on_exit_at(1, offset(base, 2));
        assert_eq!(policy.state, SupervisorState::Degraded);
        assert_eq!(policy.consecutive_flaps(), 1);
    }

    // Five consecutive Flapping classifications halt the policy.
    #[test]
    fn five_consecutive_flaps_halts() {
        let mut policy = CrashPolicy::new();
        let base = Instant::now();
        // All seven exits land inside one 60s window, so every exit from the
        // 3rd onward sees >= 3 non-zero exits in the window and is Flapping.
        // Five Flapping classifications are needed to halt, hence 2 + 5 = 7
        // exits in total.
        for i in 0..7 {
            policy.on_exit_at(1, offset(base, i));
        }
        // Exits: #1 Transient, #2 Transient, #3 Flapping, #4 Flapping,
        //        #5 Flapping, #6 Flapping, #7 Flapping = 5 flaps → Halted
        assert_eq!(policy.state, SupervisorState::Halted);
    }

    // Once Halted, a further non-zero exit still yields Halt.
    #[test]
    fn halted_always_returns_halt() {
        let mut policy = CrashPolicy::new();
        let base = Instant::now();
        // Drive to Halted
        for i in 0..7 {
            policy.on_exit_at(1, offset(base, i));
        }
        assert_eq!(policy.state, SupervisorState::Halted);

        // Only the non-zero case is asserted here: another failure while
        // Halted must return Halt. A clean exit after Halted is not
        // exercised by this test.
        let action = policy.on_exit_at(1, offset(base, 8));
        assert_eq!(action, RestartAction::Halt);
    }

    // A clean exit while Degraded returns to Healthy and zeroes the counter.
    #[test]
    fn clean_exit_resets_consecutive_flaps() {
        let mut policy = CrashPolicy::new();
        let base = Instant::now();
        // Drive to Degraded (3 quick failures)
        policy.on_exit_at(1, offset(base, 0));
        policy.on_exit_at(1, offset(base, 1));
        policy.on_exit_at(1, offset(base, 2));
        assert_eq!(policy.state, SupervisorState::Degraded);
        assert!(policy.consecutive_flaps() > 0);

        // Clean exit resets
        policy.on_exit_at(0, offset(base, 3));
        assert_eq!(policy.state, SupervisorState::Healthy);
        assert_eq!(policy.consecutive_flaps(), 0);
    }

    // `reset` un-halts the policy and clears both counter and history.
    #[test]
    fn reset_clears_everything() {
        let mut policy = CrashPolicy::new();
        let base = Instant::now();
        for i in 0..7 {
            policy.on_exit_at(1, offset(base, i));
        }
        assert_eq!(policy.state, SupervisorState::Halted);

        policy.reset();
        assert_eq!(policy.state, SupervisorState::Healthy);
        assert_eq!(policy.consecutive_flaps(), 0);
        assert_eq!(policy.history.len(), 0);
    }

    // The tags returned by `as_str` are pinned by this test.
    #[test]
    fn state_tag_strings_are_stable() {
        assert_eq!(SupervisorState::Healthy.as_str(), "healthy");
        assert_eq!(SupervisorState::Degraded.as_str(), "degraded");
        assert_eq!(SupervisorState::Halted.as_str(), "halted");
    }

    // A Transient exit while Degraded brings the policy back to Healthy.
    #[test]
    fn transient_while_degraded_recovers_to_healthy() {
        let mut policy = CrashPolicy::new();
        let base = Instant::now();
        // Drive to Degraded
        policy.on_exit_at(1, offset(base, 0));
        policy.on_exit_at(1, offset(base, 1));
        policy.on_exit_at(1, offset(base, 2));
        assert_eq!(policy.state, SupervisorState::Degraded);

        // Wait long enough that the old exits fall outside the 60s window,
        // then a single failure is Transient (< 3 in window).
        let later = offset(base, 120);
        let action = policy.on_exit_at(1, later);
        assert_eq!(
            action,
            RestartAction::RestartAfter {
                delay: TRANSIENT_DELAY,
                with_continue: true,
            }
        );
        assert_eq!(policy.state, SupervisorState::Healthy);
    }
}