Skip to main content

oxios_kernel/
readiness.rs

1//! Subsystem readiness tracking (RFC-024 SP4).
2//!
3//! A daemon can answer HTTP requests before every subsystem has finished
4//! initializing (state store loading, engine provider warm-up, etc.). Naive
5//! handling causes `500`/`Internal` errors for the first few hundred
6//! milliseconds of every restart, plus hangs when the orchestrator is
7//! permanently unavailable. This module gives callers a single atomic
8//! gate: a route is "ready" only when both the state store and the engine
9//! have reached `Ready` or `Degraded`.
10//!
//! **Four-state model** (per subsystem):
//! - `Warming` — startup, not yet `Ready`. Counts as "not ready".
//! - `Ready` — fully operational. Counts as "ready".
//! - `Degraded` — operational with limitations (e.g. engine initialized but no API key;
//!   only a fallback model available). **Counts as "ready"** so a missing API key does
//!   not lock the user out of `/api/status` for diagnosis.
//! - `Failed` — startup aborted (engine init crashed). The state store is still useful
//!   for inspection so it is allowed to become `Ready` independently; the engine `Failed`
//!   state keeps the readiness gate closed and `/api/status` is the only API that
//!   bypasses it (RFC-024 §7.1.1).
21//!
22//! **Deadline.** Callers set a deadline (default 30 s) after which any
23//! subsystem still in `Warming` is force-promoted to `Degraded` to prevent
24//! the gate from staying closed forever.
25
26use std::sync::atomic::{AtomicU8, AtomicU64, Ordering};
27use std::time::{SystemTime, UNIX_EPOCH};
28
// Raw `u8` encodings of `SubsystemState`, used for storage in `AtomicU8` slots.
// `STATE_WARMING` is also the value the decoder falls back to for unknown bytes.

/// Encoded form of `SubsystemState::Warming`.
const STATE_WARMING: u8 = 0;
/// Encoded form of `SubsystemState::Ready`.
const STATE_READY: u8 = 1;
/// Encoded form of `SubsystemState::Degraded`.
const STATE_DEGRADED: u8 = 2;
/// Encoded form of `SubsystemState::Failed`.
const STATE_FAILED: u8 = 3;
33
34/// Coarse readiness of a single subsystem.
35#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
36pub enum SubsystemState {
37    /// Startup in progress.
38    Warming,
39    /// Fully operational.
40    Ready,
41    /// Operational with limitations (still counts as "ready" for the gate).
42    Degraded,
43    /// Startup aborted; the subsystem is not usable.
44    Failed,
45}
46
47impl SubsystemState {
48    fn to_u8(self) -> u8 {
49        match self {
50            Self::Warming => STATE_WARMING,
51            Self::Ready => STATE_READY,
52            Self::Degraded => STATE_DEGRADED,
53            Self::Failed => STATE_FAILED,
54        }
55    }
56    fn from_u8(v: u8) -> Self {
57        match v {
58            STATE_READY => Self::Ready,
59            STATE_DEGRADED => Self::Degraded,
60            STATE_FAILED => Self::Failed,
61            _ => Self::Warming,
62        }
63    }
64}
65
// `Serialize`/`Deserialize` derive macros for `SubsystemState` (used by
// `KernelHandle::readiness` in tests + status JSON).
68use serde::{Deserialize, Serialize};
69
/// Atomic readiness gate over two subsystems: the state store and the engine.
///
/// Each subsystem's state is kept as an encoded `u8` in its own atomic, so the
/// gate can be read and updated through a shared reference. `is_ready()`
/// combines both into a single yes/no answer for protected API routes.
pub struct ReadinessGate {
    /// Encoded `SubsystemState` of the state store.
    state_store: AtomicU8,
    /// Encoded `SubsystemState` of the engine.
    engine: AtomicU8,
    /// Absolute wall-clock time (Unix-epoch seconds) after which subsystems
    /// that are still `Warming` get force-promoted to `Degraded`.
    /// `0` means "no deadline" (the caller is responsible).
    deadline_secs: AtomicU64,
}
80
81impl ReadinessGate {
82    /// Create a new gate in `Warming` state for both subsystems. `deadline_secs`
83    /// is the wall-clock (Unix epoch) at which any still-Warming subsystem
84    /// is force-promoted to Degraded. Pass `0` to disable the deadline.
85    pub fn new(deadline_secs: u64) -> Self {
86        Self {
87            state_store: AtomicU8::new(STATE_WARMING),
88            engine: AtomicU8::new(STATE_WARMING),
89            deadline_secs: AtomicU64::new(deadline_secs),
90        }
91    }
92
93    /// Update the wall-clock deadline for force-promoting Warming → Degraded.
94    /// Pass `0` to disable enforcement.
95    pub fn set_deadline_secs(&self, secs: u64) {
96        self.deadline_secs.store(secs, Ordering::SeqCst);
97    }
98
99    /// Read the current deadline (Unix-epoch seconds, or `0` if disabled).
100    pub fn deadline_secs(&self) -> u64 {
101        self.deadline_secs.load(Ordering::SeqCst)
102    }
103
104    /// Update the state-store readiness.
105    pub fn set_state_store(&self, s: SubsystemState) {
106        self.state_store.store(s.to_u8(), Ordering::SeqCst);
107    }
108
109    /// Update the engine readiness.
110    pub fn set_engine(&self, s: SubsystemState) {
111        self.engine.store(s.to_u8(), Ordering::SeqCst);
112    }
113
114    /// Read the current state-store state.
115    pub fn state_store_state(&self) -> SubsystemState {
116        SubsystemState::from_u8(self.state_store.load(Ordering::SeqCst))
117    }
118
119    /// Read the current engine state.
120    pub fn engine_state(&self) -> SubsystemState {
121        SubsystemState::from_u8(self.engine.load(Ordering::SeqCst))
122    }
123
124    /// `true` when the gate is open: both subsystems are `Ready` or
125    /// `Degraded`. A `Failed` (or still-`Warming`) subsystem keeps the gate
126    /// closed. `Degraded` counts as ready so a missing API key (engine)
127    /// or a slow-but-functional state store does not lock the user out
128    /// after the deadline elapses (RFC-024 SP4).
129    pub fn is_ready(&self) -> bool {
130        let s = self.state_store_state();
131        let e = self.engine_state();
132        let s_ok = s == SubsystemState::Ready || s == SubsystemState::Degraded;
133        let e_ok = e == SubsystemState::Ready || e == SubsystemState::Degraded;
134        s_ok && e_ok
135    }
136
137    /// Force-promote any still-Warming subsystem to Degraded once the
138    /// deadline elapses. Idempotent. Should be called by the kernel
139    /// during init and by the readiness middleware to enforce a ceiling
140    /// on how long a misconfigured engine can lock the gate.
141    pub fn enforce_deadline(&self) {
142        let deadline = self.deadline_secs.load(Ordering::SeqCst);
143        if deadline == 0 {
144            return;
145        }
146        if self.is_ready() {
147            return;
148        }
149        let now = SystemTime::now()
150            .duration_since(UNIX_EPOCH)
151            .unwrap_or_default()
152            .as_secs();
153        if now < deadline {
154            return;
155        }
156        if self.state_store_state() == SubsystemState::Warming {
157            self.set_state_store(SubsystemState::Degraded);
158        }
159        if self.engine_state() == SubsystemState::Warming {
160            self.set_engine(SubsystemState::Degraded);
161        }
162    }
163}
164
165impl std::fmt::Debug for ReadinessGate {
166    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
167        f.debug_struct("ReadinessGate")
168            .field("state_store", &self.state_store_state())
169            .field("engine", &self.engine_state())
170            .field("is_ready", &self.is_ready())
171            .finish()
172    }
173}
174
#[cfg(test)]
mod tests {
    use super::*;

    /// Current wall-clock time in Unix-epoch seconds.
    fn now_secs() -> u64 {
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_secs()
    }

    #[test]
    fn new_starts_warming_and_not_ready() {
        let g = ReadinessGate::new(0);
        assert!(!g.is_ready());
        assert_eq!(g.state_store_state(), SubsystemState::Warming);
        assert_eq!(g.engine_state(), SubsystemState::Warming);
    }

    #[test]
    fn both_ready_means_ready() {
        let g = ReadinessGate::new(0);
        g.set_state_store(SubsystemState::Ready);
        g.set_engine(SubsystemState::Ready);
        assert!(g.is_ready());
    }

    #[test]
    fn engine_degraded_still_counts_as_ready() {
        let g = ReadinessGate::new(0);
        g.set_state_store(SubsystemState::Ready);
        g.set_engine(SubsystemState::Degraded);
        assert!(g.is_ready());
    }

    #[test]
    fn engine_failed_keeps_gate_closed() {
        let g = ReadinessGate::new(0);
        g.set_state_store(SubsystemState::Ready);
        g.set_engine(SubsystemState::Failed);
        assert!(!g.is_ready());
    }

    #[test]
    fn state_store_not_ready_keeps_gate_closed() {
        let g = ReadinessGate::new(0);
        g.set_engine(SubsystemState::Ready);
        assert!(!g.is_ready());
    }

    #[test]
    fn deadline_elapsed_promotes_warming_to_degraded() {
        // The deadline is an absolute Unix timestamp, so `1` (1970-01-01T00:00:01Z)
        // is already in the past; no sleeping is needed.
        let g = ReadinessGate::new(1);
        g.enforce_deadline();
        assert_eq!(g.state_store_state(), SubsystemState::Degraded);
        assert_eq!(g.engine_state(), SubsystemState::Degraded);
        assert!(g.is_ready());
    }

    #[test]
    fn deadline_elapsed_promotes_only_warming_subsystem() {
        let g = ReadinessGate::new(1);
        g.set_state_store(SubsystemState::Ready);
        g.enforce_deadline();
        // `Ready` must not be downgraded; only the still-warming engine changes.
        assert_eq!(g.state_store_state(), SubsystemState::Ready);
        assert_eq!(g.engine_state(), SubsystemState::Degraded);
        assert!(g.is_ready());
    }

    #[test]
    fn deadline_elapsed_does_not_override_failed() {
        let g = ReadinessGate::new(1);
        g.set_engine(SubsystemState::Failed);
        g.enforce_deadline();
        // The warming state store is promoted, but a failed engine stays failed
        // and keeps the gate closed.
        assert_eq!(g.state_store_state(), SubsystemState::Degraded);
        assert_eq!(g.engine_state(), SubsystemState::Failed);
        assert!(!g.is_ready());
    }

    #[test]
    fn deadline_not_yet_elapsed_keeps_warming() {
        let g = ReadinessGate::new(now_secs() + 60);
        g.enforce_deadline();
        assert_eq!(g.state_store_state(), SubsystemState::Warming);
        assert_eq!(g.engine_state(), SubsystemState::Warming);
        assert!(!g.is_ready());
    }

    #[test]
    fn deadline_zero_disables_enforcement() {
        let g = ReadinessGate::new(0);
        g.enforce_deadline();
        assert_eq!(g.state_store_state(), SubsystemState::Warming);
        assert_eq!(g.engine_state(), SubsystemState::Warming);
    }

    #[test]
    fn set_deadline_secs_round_trips() {
        let g = ReadinessGate::new(0);
        assert_eq!(g.deadline_secs(), 0);
        g.set_deadline_secs(1);
        assert_eq!(g.deadline_secs(), 1);
        // With a now-elapsed deadline in place, enforcement takes effect.
        g.enforce_deadline();
        assert!(g.is_ready());
    }
}