// halldyll-core 0.1.0
//
// Core scraping engine for Halldyll - high-performance async web scraper for AI agents
// Documentation
//! Health Check - Kubernetes/Cloud Run liveness and readiness probes
//!
//! Provides health check functionality for cloud deployments.
//!
//! ## Usage
//!
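//! ```rust,ignore
//! let checker = HealthChecker::default_config();
//!
//! // For liveness probe (is the process alive?)
//! let liveness = checker.liveness();
//!
//! // For readiness probe (can we accept traffic?)
//! let metrics = HealthMetrics { success_rate: 0.99, ..Default::default() };
//! let readiness = checker.readiness(&metrics);
//!
//! // JSON body and HTTP status code for the probe endpoint
//! let body = readiness.to_json();
//! let code = readiness.status.http_status_code();
//! ```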

use std::time::Instant;
use serde::{Deserialize, Serialize};

/// Health status of the whole service or of a single component.
///
/// Variants are (de)serialized as lowercase strings (`"healthy"`,
/// `"degraded"`, `"unhealthy"`) because of `#[serde(rename_all = "lowercase")]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum HealthStatus {
    /// Service is healthy; reported as HTTP 200 by `http_status_code`.
    Healthy,
    /// Service is degraded but still operational; also reported as HTTP 200.
    Degraded,
    /// Service is unhealthy; reported as HTTP 503 by `http_status_code`.
    Unhealthy,
}

impl HealthStatus {
    /// Convert to HTTP status code
    pub fn http_status_code(&self) -> u16 {
        match self {
            HealthStatus::Healthy => 200,
            HealthStatus::Degraded => 200, // Still accept traffic
            HealthStatus::Unhealthy => 503,
        }
    }

    /// Is the service operational?
    pub fn is_operational(&self) -> bool {
        matches!(self, HealthStatus::Healthy | HealthStatus::Degraded)
    }
}

/// Outcome of a single health check; one entry in `HealthResponse::checks`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentHealth {
    /// Component name (this file emits `"process"`, `"success_rate"`,
    /// `"latency"`, `"circuit_breakers"` and `"memory"`).
    pub name: String,
    /// Status of this component.
    pub status: HealthStatus,
    /// Optional human-readable detail; left out of the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub message: Option<String>,
    /// Response time in milliseconds; left out of the serialized output when `None`.
    /// The latency check fills this with the average latency it was given.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_time_ms: Option<u64>,
}

/// Payload returned by the liveness and readiness probes.
///
/// Serializable with serde; see `to_json` / `to_json_compact` for ready-made
/// JSON rendering.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthResponse {
    /// Overall status of the probe.
    pub status: HealthStatus,
    /// Version string of the reporting service.
    pub version: String,
    /// Uptime in whole seconds, measured from the creation of the `HealthChecker`.
    pub uptime_secs: u64,
    /// Individual component checks that were evaluated for this response.
    pub checks: Vec<ComponentHealth>,
    /// Time the response was built, as an RFC 3339 (ISO 8601 compatible) UTC string.
    pub timestamp: String,
}

/// Thresholds used by `HealthChecker::readiness` to score each metric.
///
/// All thresholds are inclusive: a metric exactly equal to a threshold still
/// counts for the better status.
///
/// NOTE(review): nothing in this file validates that the "healthy" thresholds
/// are stricter than the "degraded" ones — confirm callers supply consistent
/// values.
#[derive(Debug, Clone)]
pub struct HealthCheckConfig {
    /// Minimum success rate (0.0 - 1.0) for the success-rate check to be healthy.
    pub success_rate_healthy: f64,
    /// Minimum success rate (0.0 - 1.0) for the success-rate check to be
    /// degraded; anything lower is unhealthy.
    pub success_rate_degraded: f64,
    /// Maximum average latency (ms) for the latency check to be healthy.
    pub max_latency_healthy_ms: u64,
    /// Maximum average latency (ms) for the latency check to be degraded;
    /// anything higher is unhealthy.
    pub max_latency_degraded_ms: u64,
    /// Maximum number of open circuit breakers for the circuit check to be healthy.
    pub max_open_circuits_healthy: usize,
    /// Maximum number of open circuit breakers for the circuit check to be
    /// degraded; anything higher is unhealthy.
    pub max_open_circuits_degraded: usize,
}

impl Default for HealthCheckConfig {
    fn default() -> Self {
        Self {
            success_rate_healthy: 0.95,
            success_rate_degraded: 0.80,
            max_latency_healthy_ms: 5000,
            max_latency_degraded_ms: 15000,
            max_open_circuits_healthy: 2,
            max_open_circuits_degraded: 10,
        }
    }
}

/// Builds liveness and readiness `HealthResponse`s.
///
/// Holds the scoring thresholds, the instant the checker was created (used to
/// report uptime) and the version string copied into every response.
pub struct HealthChecker {
    /// Thresholds applied by the readiness checks.
    config: HealthCheckConfig,
    /// Creation time of this checker; uptime is measured from here.
    started_at: Instant,
    /// Version string placed in every response.
    version: String,
}

impl HealthChecker {
    /// Create new health checker
    pub fn new(config: HealthCheckConfig) -> Self {
        Self {
            config,
            started_at: Instant::now(),
            version: env!("CARGO_PKG_VERSION").to_string(),
        }
    }

    /// Create with default config
    pub fn default_config() -> Self {
        Self::new(HealthCheckConfig::default())
    }

    /// Liveness check - is the process alive?
    /// This should almost always return healthy unless the process is stuck
    pub fn liveness(&self) -> HealthResponse {
        HealthResponse {
            status: HealthStatus::Healthy,
            version: self.version.clone(),
            uptime_secs: self.started_at.elapsed().as_secs(),
            checks: vec![ComponentHealth {
                name: "process".to_string(),
                status: HealthStatus::Healthy,
                message: Some("Process is running".to_string()),
                response_time_ms: None,
            }],
            timestamp: chrono::Utc::now().to_rfc3339(),
        }
    }

    /// Readiness check - can we accept traffic?
    pub fn readiness(&self, metrics: &HealthMetrics) -> HealthResponse {
        let mut checks = Vec::new();
        let mut overall_status = HealthStatus::Healthy;

        // Check success rate
        let success_check = self.check_success_rate(metrics.success_rate);
        if success_check.status == HealthStatus::Unhealthy {
            overall_status = HealthStatus::Unhealthy;
        } else if success_check.status == HealthStatus::Degraded && overall_status == HealthStatus::Healthy {
            overall_status = HealthStatus::Degraded;
        }
        checks.push(success_check);

        // Check latency
        let latency_check = self.check_latency(metrics.avg_latency_ms);
        if latency_check.status == HealthStatus::Unhealthy {
            overall_status = HealthStatus::Unhealthy;
        } else if latency_check.status == HealthStatus::Degraded && overall_status == HealthStatus::Healthy {
            overall_status = HealthStatus::Degraded;
        }
        checks.push(latency_check);

        // Check circuit breakers
        let circuit_check = self.check_circuits(metrics.open_circuits);
        if circuit_check.status == HealthStatus::Unhealthy {
            overall_status = HealthStatus::Unhealthy;
        } else if circuit_check.status == HealthStatus::Degraded && overall_status == HealthStatus::Healthy {
            overall_status = HealthStatus::Degraded;
        }
        checks.push(circuit_check);

        // Check memory (if available)
        if let Some(memory_mb) = metrics.memory_mb {
            checks.push(ComponentHealth {
                name: "memory".to_string(),
                status: HealthStatus::Healthy,
                message: Some(format!("{} MB used", memory_mb)),
                response_time_ms: None,
            });
        }

        HealthResponse {
            status: overall_status,
            version: self.version.clone(),
            uptime_secs: self.started_at.elapsed().as_secs(),
            checks,
            timestamp: chrono::Utc::now().to_rfc3339(),
        }
    }

    fn check_success_rate(&self, rate: f64) -> ComponentHealth {
        let (status, message) = if rate >= self.config.success_rate_healthy {
            (HealthStatus::Healthy, format!("{:.1}% success rate", rate * 100.0))
        } else if rate >= self.config.success_rate_degraded {
            (HealthStatus::Degraded, format!("{:.1}% success rate (degraded)", rate * 100.0))
        } else {
            (HealthStatus::Unhealthy, format!("{:.1}% success rate (critical)", rate * 100.0))
        };

        ComponentHealth {
            name: "success_rate".to_string(),
            status,
            message: Some(message),
            response_time_ms: None,
        }
    }

    fn check_latency(&self, latency_ms: f64) -> ComponentHealth {
        let (status, message) = if latency_ms <= self.config.max_latency_healthy_ms as f64 {
            (HealthStatus::Healthy, format!("{:.0}ms avg latency", latency_ms))
        } else if latency_ms <= self.config.max_latency_degraded_ms as f64 {
            (HealthStatus::Degraded, format!("{:.0}ms avg latency (high)", latency_ms))
        } else {
            (HealthStatus::Unhealthy, format!("{:.0}ms avg latency (critical)", latency_ms))
        };

        ComponentHealth {
            name: "latency".to_string(),
            status,
            message: Some(message),
            response_time_ms: Some(latency_ms as u64),
        }
    }

    fn check_circuits(&self, open_circuits: usize) -> ComponentHealth {
        let (status, message) = if open_circuits <= self.config.max_open_circuits_healthy {
            (HealthStatus::Healthy, format!("{} open circuits", open_circuits))
        } else if open_circuits <= self.config.max_open_circuits_degraded {
            (HealthStatus::Degraded, format!("{} open circuits (elevated)", open_circuits))
        } else {
            (HealthStatus::Unhealthy, format!("{} open circuits (critical)", open_circuits))
        };

        ComponentHealth {
            name: "circuit_breakers".to_string(),
            status,
            message: Some(message),
            response_time_ms: None,
        }
    }
}

/// Metrics snapshot the caller passes to `HealthChecker::readiness`.
///
/// Note: the derived `Default` sets `success_rate` to `0.0`, which the
/// readiness check scores as unhealthy under the default thresholds.
#[derive(Debug, Clone, Default)]
pub struct HealthMetrics {
    /// Success rate (0.0 - 1.0)
    pub success_rate: f64,
    /// Average latency (ms)
    pub avg_latency_ms: f64,
    /// Number of open circuit breakers
    pub open_circuits: usize,
    /// Memory usage (MB) - optional; when present it is reported as an
    /// informational check that is always healthy.
    pub memory_mb: Option<u64>,
    /// Active requests. Not read by `HealthChecker::readiness` in this file.
    pub active_requests: usize,
}

impl HealthResponse {
    /// Render the response as pretty-printed (indented) JSON.
    ///
    /// Best effort: if serialization fails, the literal `"{}"` is returned
    /// instead of an error.
    pub fn to_json(&self) -> String {
        match serde_json::to_string_pretty(self) {
            Ok(json) => json,
            Err(_) => String::from("{}"),
        }
    }

    /// Render the response as compact, single-line JSON.
    ///
    /// Best effort: if serialization fails, the literal `"{}"` is returned
    /// instead of an error.
    pub fn to_json_compact(&self) -> String {
        match serde_json::to_string(self) {
            Ok(json) => json,
            Err(_) => String::from("{}"),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Run a readiness probe with the default thresholds and return its overall status.
    fn readiness_status(metrics: HealthMetrics) -> HealthStatus {
        HealthChecker::default_config().readiness(&metrics).status
    }

    #[test]
    fn test_liveness_always_healthy() {
        let status = HealthChecker::default_config().liveness().status;
        assert_eq!(status, HealthStatus::Healthy);
    }

    #[test]
    fn test_readiness_healthy() {
        // Every metric sits comfortably inside the healthy thresholds.
        let status = readiness_status(HealthMetrics {
            success_rate: 0.99,
            avg_latency_ms: 100.0,
            open_circuits: 0,
            memory_mb: Some(256),
            active_requests: 5,
        });
        assert_eq!(status, HealthStatus::Healthy);
    }

    #[test]
    fn test_readiness_degraded() {
        // Every metric falls between its healthy and degraded thresholds.
        let status = readiness_status(HealthMetrics {
            success_rate: 0.85,
            avg_latency_ms: 8000.0,
            open_circuits: 5,
            memory_mb: None,
            active_requests: 10,
        });
        assert_eq!(status, HealthStatus::Degraded);
    }

    #[test]
    fn test_readiness_unhealthy() {
        // Every metric is beyond its degraded threshold.
        let status = readiness_status(HealthMetrics {
            success_rate: 0.50,
            avg_latency_ms: 20000.0,
            open_circuits: 20,
            memory_mb: None,
            active_requests: 0,
        });
        assert_eq!(status, HealthStatus::Unhealthy);
    }

    #[test]
    fn test_json_output() {
        let json = HealthChecker::default_config().liveness().to_json();

        for key in &["\"status\"", "\"version\"", "\"uptime_secs\""] {
            assert!(json.contains(*key), "missing key {} in {}", key, json);
        }
    }

    #[test]
    fn test_http_status_codes() {
        let expectations = [
            (HealthStatus::Healthy, 200),
            (HealthStatus::Degraded, 200),
            (HealthStatus::Unhealthy, 503),
        ];

        for &(status, code) in expectations.iter() {
            assert_eq!(status.http_status_code(), code);
        }
    }
}