//! Health checking infrastructure for provider pools.
//!
//! This module provides background health checking for provider deployments,
//! automatically marking deployments unhealthy and recovering them once they come back online.
//!
//! # Example
//!
//! ```ignore
//! use llmkit::{HealthChecker, HealthCheckConfig, ProviderPool};
//!
//! let checker = HealthChecker::new(pool, HealthCheckConfig::default());
//! let handle = checker.start(); // Spawns a background task
//! // Keep `handle` alive: dropping it stops the checker.
//! ```
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::watch;
use tokio::time::interval;
use crate::pool::{HealthCheckConfig, ProviderPool};
use crate::types::{CompletionRequest, Message, Role};
/// Type of health check to perform.
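///
/// # Example
///
/// A sketch of constructing each variant; the model name (and the crate-root
/// re-export of `HealthCheckType`) are assumptions for illustration:
///
/// ```ignore
/// use std::sync::Arc;
/// use llmkit::HealthCheckType;
///
/// // Cheapest option: only verify the endpoint answers.
/// let ping = HealthCheckType::Ping;
///
/// // Issue a tiny real completion against a known model.
/// let probe = HealthCheckType::Probe {
///     model: "gpt-4o-mini".to_string(),
///     max_tokens: 1,
/// };
///
/// // Plug in an arbitrary synchronous predicate.
/// let custom = HealthCheckType::Custom(Arc::new(|| true));
/// ```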
#[derive(Clone, Default)]
pub enum HealthCheckType {
/// Simple connectivity check - just verify the endpoint is reachable
/// (uses a minimal request that should fail fast if unhealthy)
#[default]
Ping,
/// Probe with a minimal completion request
Probe {
/// Model to use for the probe
model: String,
/// Maximum tokens to generate
max_tokens: u32,
},
/// Custom health check function
Custom(Arc<dyn Fn() -> bool + Send + Sync>),
}
impl std::fmt::Debug for HealthCheckType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
HealthCheckType::Ping => write!(f, "Ping"),
HealthCheckType::Probe { model, max_tokens } => f
.debug_struct("Probe")
.field("model", model)
.field("max_tokens", max_tokens)
.finish(),
HealthCheckType::Custom(_) => write!(f, "Custom(<fn>)"),
}
}
}
/// Health checker that runs periodic checks on pool deployments.
pub struct HealthChecker {
/// The pool to check
pool: Arc<ProviderPool>,
/// Health check configuration
config: HealthCheckConfig,
/// Type of health check
check_type: HealthCheckType,
}
impl HealthChecker {
/// Create a new health checker for a pool.
pub fn new(pool: Arc<ProviderPool>, config: HealthCheckConfig) -> Self {
Self {
pool,
config,
check_type: HealthCheckType::default(),
}
}
/// Set the health check type.
pub fn with_check_type(mut self, check_type: HealthCheckType) -> Self {
self.check_type = check_type;
self
}
/// Start the health checker in a background task.
///
    /// Returns a handle that can be used to stop the checker; dropping the
    /// handle also stops it. If health checks are disabled in the config,
    /// no background task is spawned.
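    ///
    /// # Example
    ///
    /// A sketch of a probe-based checker; the model name is illustrative:
    ///
    /// ```ignore
    /// let handle = HealthChecker::new(pool, HealthCheckConfig::default())
    ///     .with_check_type(HealthCheckType::Probe {
    ///         model: "gpt-4o-mini".to_string(),
    ///         max_tokens: 1,
    ///     })
    ///     .start();
    /// // ... later, during shutdown:
    /// handle.stop();
    /// ```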
    pub fn start(self) -> HealthCheckerHandle {
        let (shutdown_tx, shutdown_rx) = watch::channel(false);
        let HealthChecker { pool, config, check_type } = self;
        // Respect the `enabled` flag: when checks are disabled, return an
        // inert handle without spawning the background task.
        if config.enabled {
            tokio::spawn(async move {
                Self::run_loop(pool, config, check_type, shutdown_rx).await;
            });
        }
        HealthCheckerHandle { shutdown_tx }
    }
/// Run the health check loop.
async fn run_loop(
pool: Arc<ProviderPool>,
config: HealthCheckConfig,
check_type: HealthCheckType,
mut shutdown_rx: watch::Receiver<bool>,
) {
        let mut ticker = interval(config.interval);
        // The first tick fires immediately, so a check runs at startup. If
        // ticks are missed (e.g. under load), skip them instead of bursting
        // several checks back to back.
        ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
loop {
tokio::select! {
_ = ticker.tick() => {
Self::check_all_deployments(&pool, &config, &check_type).await;
}
                res = shutdown_rx.changed() => {
                    // Stop on an explicit signal, or if the sender (i.e. the
                    // handle) has been dropped.
                    if res.is_err() || *shutdown_rx.borrow() {
                        tracing::info!("Health checker shutting down");
                        break;
                    }
                }
}
}
}
/// Check all deployments in the pool.
async fn check_all_deployments(
pool: &ProviderPool,
config: &HealthCheckConfig,
check_type: &HealthCheckType,
) {
for deployment in pool.deployments() {
let deployment_name = deployment.name.clone();
let provider = deployment.provider.clone();
let start = Instant::now();
let check_result = match check_type {
HealthCheckType::Ping => {
// Use a minimal request that should fail fast if unhealthy
// We don't actually need a response, just need to verify connectivity
tokio::time::timeout(config.timeout, async {
// Create a minimal request
let request = CompletionRequest::new(
provider.default_model().unwrap_or("gpt-4o-mini"),
vec![Message::new(Role::User, vec![])],
)
.with_max_tokens(1);
// The request will likely fail due to empty content,
// but that's fine - we just want to verify connectivity
match provider.complete(request).await {
Ok(_) => true,
Err(e) => {
// Connection errors and timeouts mean unhealthy
// Other errors (validation, etc.) mean the endpoint is reachable
!e.is_retryable()
}
}
})
.await
.unwrap_or(false)
}
HealthCheckType::Probe { model, max_tokens } => {
tokio::time::timeout(config.timeout, async {
let request =
CompletionRequest::new(model.clone(), vec![Message::user("ping")])
.with_max_tokens(*max_tokens);
provider.complete(request).await.is_ok()
})
.await
.unwrap_or(false)
}
HealthCheckType::Custom(check_fn) => check_fn(),
};
let latency = start.elapsed();
tracing::debug!(
deployment = %deployment_name,
healthy = check_result,
latency_ms = latency.as_millis(),
"Health check completed"
);
            // The pool tracks health internally from request outcomes, so the
            // probe result is not written back directly here; surface failures
            // loudly so operators notice them in the logs.
            if !check_result {
                tracing::warn!(
                    deployment = %deployment_name,
                    "Health check failed"
                );
            }
}
}
}
/// Handle for controlling the health checker.
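///
/// Dropping the handle also stops the checker, so keep it alive for as long
/// as checks should run. A sketch of tying shutdown to Ctrl-C (assumes
/// tokio's `signal` feature is enabled):
///
/// ```ignore
/// let handle = checker.start();
/// tokio::signal::ctrl_c().await?;
/// handle.stop();
/// ```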
pub struct HealthCheckerHandle {
shutdown_tx: watch::Sender<bool>,
}
impl HealthCheckerHandle {
/// Stop the health checker.
pub fn stop(&self) {
let _ = self.shutdown_tx.send(true);
}
}
impl Drop for HealthCheckerHandle {
fn drop(&mut self) {
self.stop();
}
}
/// Result of a health check.
#[derive(Debug, Clone)]
pub struct HealthCheckResult {
/// Deployment name
pub deployment: String,
/// Whether the check passed
pub healthy: bool,
/// Latency of the check
pub latency: Duration,
/// Error message if unhealthy
pub error: Option<String>,
/// Timestamp of the check
pub timestamp: Instant,
}
/// Aggregate health status for a pool.
#[derive(Debug, Clone)]
pub struct PoolHealthStatus {
/// Total number of deployments
pub total_deployments: usize,
/// Number of healthy deployments
pub healthy_deployments: usize,
/// Average latency across healthy deployments
pub avg_latency_ms: u64,
/// Whether the pool is considered healthy (at least one deployment healthy)
pub pool_healthy: bool,
/// Individual deployment statuses
pub deployments: Vec<DeploymentStatus>,
}
/// Status of a single deployment.
#[derive(Debug, Clone)]
pub struct DeploymentStatus {
/// Deployment name
pub name: String,
/// Whether healthy
pub healthy: bool,
/// P50 latency in ms
pub latency_p50_ms: u64,
/// P99 latency in ms
pub latency_p99_ms: u64,
/// Current requests in flight
pub requests_in_flight: u32,
/// Total requests handled
pub total_requests: u64,
/// Error rate (0.0 to 1.0)
pub error_rate: f64,
}
impl ProviderPool {
/// Get aggregate health status for the pool.
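    ///
    /// # Example
    ///
    /// A sketch of acting on the aggregate status:
    ///
    /// ```ignore
    /// let status = pool.health_status();
    /// if !status.pool_healthy {
    ///     tracing::warn!("no healthy deployments left in the pool");
    /// }
    /// for d in &status.deployments {
    ///     println!(
    ///         "{}: healthy={} p50={}ms err={:.2}%",
    ///         d.name, d.healthy, d.latency_p50_ms, d.error_rate * 100.0
    ///     );
    /// }
    /// ```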
pub fn health_status(&self) -> PoolHealthStatus {
let health_map = self.all_health();
let deployments: Vec<DeploymentStatus> = self
.deployments()
.iter()
.map(|d| {
let h = health_map.get(&d.name).cloned().unwrap_or_default();
let error_rate = if h.total_requests > 0 {
h.total_errors as f64 / h.total_requests as f64
} else {
0.0
};
DeploymentStatus {
name: d.name.clone(),
healthy: h.healthy,
latency_p50_ms: h.latency_p50_ms,
latency_p99_ms: h.latency_p99_ms,
requests_in_flight: h.requests_in_flight,
total_requests: h.total_requests,
error_rate,
}
})
.collect();
let healthy_count = deployments.iter().filter(|d| d.healthy).count();
let avg_latency = if healthy_count > 0 {
deployments
.iter()
.filter(|d| d.healthy)
.map(|d| d.latency_p50_ms)
.sum::<u64>()
/ healthy_count as u64
} else {
0
};
PoolHealthStatus {
total_deployments: deployments.len(),
healthy_deployments: healthy_count,
avg_latency_ms: avg_latency,
pool_healthy: healthy_count > 0,
deployments,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_health_check_config_default() {
let config = HealthCheckConfig::default();
assert_eq!(config.interval, Duration::from_secs(30));
assert_eq!(config.timeout, Duration::from_secs(10));
assert_eq!(config.failure_threshold, 3);
assert_eq!(config.recovery_threshold, 2);
assert!(config.enabled);
}
#[test]
fn test_health_check_type_default() {
let check_type = HealthCheckType::default();
assert!(matches!(check_type, HealthCheckType::Ping));
}
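    // The Debug impl is hand-written because `Custom` wraps a closure;
    // verify the placeholder formatting it promises.
    #[test]
    fn test_health_check_type_custom_debug() {
        let check = HealthCheckType::Custom(Arc::new(|| true));
        assert_eq!(format!("{:?}", check), "Custom(<fn>)");
    }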
}