1use super::{ComponentHealth, HealthStatus, SystemHealth};
2use anyhow::Result;
3use chrono::Utc;
4use serde::{Deserialize, Serialize};
5use sqlx::PgPool;
6use std::collections::HashMap;
7use std::sync::Arc;
8use std::time::{Duration, Instant, SystemTime};
9use tracing::{debug, error, info, warn};
10
11#[derive(Debug, Clone)]
12pub struct HealthChecker {
13 db_pool: Arc<PgPool>,
14 start_time: SystemTime,
15 component_thresholds: HealthThresholds,
16}
17
/// Tunable limits used when grading component health.
#[derive(Clone, Debug)]
pub struct HealthThresholds {
    /// Database probe time, in milliseconds, above which the database is
    /// reported as degraded.
    pub max_response_time_ms: u64,
    /// Error-rate limit. NOTE(review): not read by any check visible in this
    /// file.
    pub max_error_rate: f64,
    /// Memory usage limit in percent. NOTE(review): not read by any check
    /// visible in this file.
    pub max_memory_usage_percent: f64,
    /// CPU usage, in percent, above which system resources are reported as
    /// degraded.
    pub max_cpu_usage_percent: f64,
    /// Connection pool utilization, in percent, above which the pool is
    /// reported as degraded.
    pub max_connection_pool_utilization: f64,
}
26
27impl Default for HealthThresholds {
28 fn default() -> Self {
29 Self {
30 max_response_time_ms: 1000, max_error_rate: 0.05, max_memory_usage_percent: 80.0,
33 max_cpu_usage_percent: 90.0,
34 max_connection_pool_utilization: 80.0,
35 }
36 }
37}
38
39impl HealthChecker {
40 pub fn new(db_pool: Arc<PgPool>) -> Self {
41 Self {
42 db_pool,
43 start_time: SystemTime::now(),
44 component_thresholds: HealthThresholds::default(),
45 }
46 }
47
48 pub fn with_thresholds(mut self, thresholds: HealthThresholds) -> Self {
49 self.component_thresholds = thresholds;
50 self
51 }
52
53 pub async fn check_system_health(&self) -> Result<SystemHealth> {
55 let start_check = Instant::now();
56 let mut components = HashMap::new();
57
58 let db_health = self.check_database_health().await;
60 components.insert("database".to_string(), db_health);
61
62 let memory_health = self.check_memory_health().await;
64 components.insert("memory_system".to_string(), memory_health);
65
66 let pool_health = self.check_connection_pool_health().await;
68 components.insert("connection_pool".to_string(), pool_health);
69
70 let system_health = self.check_system_resources().await;
72 components.insert("system_resources".to_string(), system_health);
73
74 let overall_status = self.determine_overall_status(&components);
76
77 let uptime = self
78 .start_time
79 .elapsed()
80 .unwrap_or_else(|_| Duration::from_secs(0))
81 .as_secs();
82
83 let memory_usage = self.get_memory_usage().await.unwrap_or(0);
84 let cpu_usage = self.get_cpu_usage().await.unwrap_or(0.0);
85
86 let health = SystemHealth {
87 status: overall_status,
88 timestamp: Utc::now(),
89 components,
90 uptime_seconds: uptime,
91 memory_usage_bytes: memory_usage,
92 cpu_usage_percent: cpu_usage,
93 };
94
95 let check_duration = start_check.elapsed().as_millis();
96 debug!("System health check completed in {}ms", check_duration);
97
98 Ok(health)
99 }
100
101 async fn check_database_health(&self) -> ComponentHealth {
103 let start = Instant::now();
104 let mut status = HealthStatus::Healthy;
105 let mut message = None;
106 let mut error_count = 0;
107
108 match sqlx::query("SELECT 1 as health_check")
110 .fetch_one(self.db_pool.as_ref())
111 .await
112 {
113 Ok(_) => {
114 debug!("Database connectivity check passed");
115 }
116 Err(e) => {
117 status = HealthStatus::Unhealthy;
118 message = Some(format!("Database connection failed: {e}"));
119 error_count += 1;
120 error!("Database health check failed: {}", e);
121 }
122 }
123
124 if status == HealthStatus::Healthy {
126 match sqlx::query("SELECT COUNT(*) FROM memories WHERE status = 'active'")
127 .fetch_one(self.db_pool.as_ref())
128 .await
129 {
130 Ok(_) => {
131 let response_time = start.elapsed().as_millis() as u64;
132 if response_time > self.component_thresholds.max_response_time_ms {
133 status = HealthStatus::Degraded;
134 message = Some(format!("Slow database response: {response_time}ms"));
135 warn!("Database response time degraded: {}ms", response_time);
136 }
137 }
138 Err(e) => {
139 status = HealthStatus::Degraded;
140 message = Some(format!("Database query performance issue: {e}"));
141 error_count += 1;
142 warn!("Database performance check failed: {}", e);
143 }
144 }
145 }
146
147 let response_time_ms = start.elapsed().as_millis() as u64;
148
149 ComponentHealth {
150 status,
151 message,
152 last_checked: Utc::now(),
153 response_time_ms: Some(response_time_ms),
154 error_count,
155 }
156 }
157
158 async fn check_memory_health(&self) -> ComponentHealth {
160 let start = Instant::now();
161 let mut status = HealthStatus::Healthy;
162 let mut message = None;
163 let mut error_count = 0;
164
165 match sqlx::query_as::<_, (String, i64)>(
167 "SELECT tier, COUNT(*) FROM memories WHERE status = 'active' GROUP BY tier",
168 )
169 .fetch_all(self.db_pool.as_ref())
170 .await
171 {
172 Ok(tier_counts) => {
173 let total: i64 = tier_counts.iter().map(|(_, count)| count).sum();
174
175 if let Some((_, working_count)) =
177 tier_counts.iter().find(|(tier, _)| tier == "working")
178 {
179 let working_ratio = *working_count as f64 / total as f64;
180 if working_ratio > 0.7 {
181 status = HealthStatus::Degraded;
183 message = Some(format!(
184 "Memory pressure detected: {:.1}% in working tier",
185 working_ratio * 100.0
186 ));
187 warn!(
188 "Memory pressure: {:.1}% of memories in working tier",
189 working_ratio * 100.0
190 );
191 }
192 }
193
194 info!(
195 "Memory tier distribution check passed: {} active memories",
196 total
197 );
198 }
199 Err(e) => {
200 status = HealthStatus::Degraded;
201 message = Some(format!("Memory tier check failed: {e}"));
202 error_count += 1;
203 warn!("Memory tier health check failed: {}", e);
204 }
205 }
206
207 match sqlx::query_scalar::<_, i64>(
209 "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'migration_history' AND column_name = 'success'"
210 )
211 .fetch_one(self.db_pool.as_ref())
212 .await
213 {
214 Ok(column_exists) if column_exists > 0 => {
215 match sqlx::query_scalar::<_, i64>(
217 "SELECT COUNT(*) FROM migration_history WHERE success = false AND migrated_at > NOW() - INTERVAL '1 hour'"
218 )
219 .fetch_one(self.db_pool.as_ref())
220 .await
221 {
222 Ok(failure_count) => {
223 if failure_count > 10 {
224 status = HealthStatus::Degraded;
225 message = Some(format!("High migration failure rate: {failure_count} failures in last hour"));
226 warn!("High migration failure rate: {} failures in last hour", failure_count);
227 }
228 }
229 Err(e) => {
230 warn!("Failed to check migration failures: {}", e);
231 error_count += 1;
232 }
233 }
234 }
235 _ => {
236 }
238 }
239
240 let response_time_ms = start.elapsed().as_millis() as u64;
241
242 ComponentHealth {
243 status,
244 message,
245 last_checked: Utc::now(),
246 response_time_ms: Some(response_time_ms),
247 error_count,
248 }
249 }
250
251 async fn check_connection_pool_health(&self) -> ComponentHealth {
253 let start = Instant::now();
254 let mut status = HealthStatus::Healthy;
255 let mut message = None;
256
257 let pool_size = self.db_pool.size();
259 let idle_connections = self.db_pool.num_idle();
260 let max_size = 100; let utilization = if max_size > 0 {
263 ((pool_size as usize - idle_connections) as f64 / max_size as f64) * 100.0
264 } else {
265 0.0
266 };
267
268 if utilization > self.component_thresholds.max_connection_pool_utilization {
269 status = HealthStatus::Degraded;
270 message = Some(format!(
271 "High connection pool utilization: {utilization:.1}%"
272 ));
273 warn!("Connection pool utilization high: {:.1}%", utilization);
274 } else if utilization > 90.0 {
275 status = HealthStatus::Unhealthy;
276 message = Some(format!(
277 "Critical connection pool utilization: {utilization:.1}%"
278 ));
279 error!("Connection pool utilization critical: {:.1}%", utilization);
280 }
281
282 let response_time_ms = start.elapsed().as_millis() as u64;
283
284 info!(
285 "Connection pool health: {}/{} connections used ({:.1}% utilization)",
286 pool_size as usize - idle_connections,
287 max_size,
288 utilization
289 );
290
291 ComponentHealth {
292 status,
293 message,
294 last_checked: Utc::now(),
295 response_time_ms: Some(response_time_ms),
296 error_count: 0,
297 }
298 }
299
300 async fn check_system_resources(&self) -> ComponentHealth {
302 let start = Instant::now();
303 let mut status = HealthStatus::Healthy;
304 let mut message = None;
305
306 let memory_usage = self.get_memory_usage().await.unwrap_or(0);
307 let cpu_usage = self.get_cpu_usage().await.unwrap_or(0.0);
308
309 let memory_usage_mb = memory_usage / (1024 * 1024);
311 if memory_usage_mb > 1024 {
312 status = HealthStatus::Degraded;
314 message = Some(format!("High memory usage: {memory_usage_mb}MB"));
315 }
316
317 if cpu_usage > self.component_thresholds.max_cpu_usage_percent {
319 status = HealthStatus::Degraded;
320 let cpu_message = format!("High CPU usage: {cpu_usage:.1}%");
321 message = match message {
322 Some(existing) => Some(format!("{existing}; {cpu_message}")),
323 None => Some(cpu_message),
324 };
325 }
326
327 let response_time_ms = start.elapsed().as_millis() as u64;
328
329 ComponentHealth {
330 status,
331 message,
332 last_checked: Utc::now(),
333 response_time_ms: Some(response_time_ms),
334 error_count: 0,
335 }
336 }
337
338 fn determine_overall_status(
340 &self,
341 components: &HashMap<String, ComponentHealth>,
342 ) -> HealthStatus {
343 let mut has_unhealthy = false;
344 let mut has_degraded = false;
345
346 for (component_name, health) in components {
347 match health.status {
348 HealthStatus::Unhealthy => {
349 has_unhealthy = true;
350 error!(
351 "Component {} is unhealthy: {:?}",
352 component_name, health.message
353 );
354 }
355 HealthStatus::Degraded => {
356 has_degraded = true;
357 warn!(
358 "Component {} is degraded: {:?}",
359 component_name, health.message
360 );
361 }
362 HealthStatus::Healthy => {
363 debug!("Component {} is healthy", component_name);
364 }
365 }
366 }
367
368 if has_unhealthy {
369 HealthStatus::Unhealthy
370 } else if has_degraded {
371 HealthStatus::Degraded
372 } else {
373 HealthStatus::Healthy
374 }
375 }
376
377 async fn get_memory_usage(&self) -> Result<u64> {
379 Ok(512 * 1024 * 1024) }
383
384 async fn get_cpu_usage(&self) -> Result<f64> {
386 Ok(25.0) }
390}
391
392#[derive(Debug, Serialize, Deserialize)]
394pub struct SimpleHealthResponse {
395 pub status: String,
396 pub timestamp: String,
397 pub uptime_seconds: u64,
398}
399
400impl From<&SystemHealth> for SimpleHealthResponse {
401 fn from(health: &SystemHealth) -> Self {
402 Self {
403 status: match health.status {
404 HealthStatus::Healthy => "healthy".to_string(),
405 HealthStatus::Degraded => "degraded".to_string(),
406 HealthStatus::Unhealthy => "unhealthy".to_string(),
407 },
408 timestamp: health.timestamp.to_rfc3339(),
409 uptime_seconds: health.uptime_seconds,
410 }
411 }
412}
413
#[cfg(test)]
mod tests {
    use super::*;

    /// The default thresholds carry the expected response-time and
    /// error-rate limits.
    #[test]
    fn test_health_thresholds_default() {
        let HealthThresholds {
            max_response_time_ms,
            max_error_rate,
            ..
        } = HealthThresholds::default();

        assert_eq!(max_response_time_ms, 1000);
        assert_eq!(max_error_rate, 0.05);
    }

    /// Converting a healthy report yields the "healthy" label and keeps the
    /// uptime unchanged.
    #[test]
    fn test_simple_health_response_conversion() {
        let report = SystemHealth {
            status: HealthStatus::Healthy,
            timestamp: Utc::now(),
            components: HashMap::new(),
            uptime_seconds: 3600,
            memory_usage_bytes: 1024 * 1024,
            cpu_usage_percent: 25.0,
        };

        let summary = SimpleHealthResponse::from(&report);

        assert_eq!(summary.status, "healthy");
        assert_eq!(summary.uptime_seconds, 3600);
    }
}