// Copyright (c) 2026 Kirky.X
//
// Licensed under the MIT License
// See LICENSE file in the project root for full license information.

//! # Health monitoring module
//!
//! Provides health monitoring and metrics collection for Inklog, with
//! support for Prometheus-format export.
//!
//! ## Overview
//!
//! This module contains:
//! - **SinkStatus/SinkHealth**: health-state tracking for sink components
//! - **Metrics**: the core metrics collector
//! - **Prometheus export**: a text format readable via an HTTP endpoint
//!
//! ## Features
//!
//! - **Real-time health checks**: tracks the running state of each sink
//! - **Metrics collection**: records log writes, errors, latency, and more
//! - **Histogram statistics**: latency distribution buckets
//! - **Prometheus compatible**: integrates directly with Prometheus
//!
//! ## Usage
//!
//! ```rust
//! use inklog::metrics::Metrics;
//!
//! let metrics = Metrics::new();
//!
//! // Record a log write
//! metrics.inc_logs_written();
//!
//! // Record an error
//! metrics.inc_sink_error();
//!
//! // Update a sink's health status
//! metrics.update_sink_health("console", true, None);
//!
//! // Get the overall health status
//! let health = metrics.get_status(0, 10000);
//! ```
//!
//! ## Prometheus metrics
//!
//! | Metric | Type | Description |
//! |------|------|------|
//! | `inklog_logs_written_total` | Counter | Total logs successfully written |
//! | `inklog_logs_dropped_total` | Counter | Total logs dropped |
//! | `inklog_channel_blocked_total` | Counter | Total channel blocking events |
//! | `inklog_sink_errors_total` | Counter | Total sink errors |
//! | `inklog_active_workers` | Gauge | Current active worker threads |
//! | `inklog_avg_latency_us` | Gauge | Average processing latency (microseconds) |
//! | `inklog_latency_bucket` | Histogram | Processing latency distribution (microseconds) |
//! | `inklog_sink_healthy` | Gauge | Sink health status (1=operational, 0=not) |
//! | `inklog_uptime_seconds` | Gauge | Uptime in seconds |
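//!
//! ## Health endpoint example
//!
//! `HealthStatus` derives `serde::Serialize`, so a health endpoint can emit
//! JSON directly. A minimal sketch (assuming `serde_json` is available as a
//! dependency; it is not required by this module itself):
//!
//! ```rust,ignore
//! use inklog::metrics::Metrics;
//!
//! let metrics = Metrics::new();
//! let status = metrics.get_status(0, 10_000);
//! let json = serde_json::to_string_pretty(&status).unwrap();
//! println!("{json}");
//! ```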

use serde::Serialize;
use std::collections::HashMap;
use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
use std::sync::Mutex;
use std::time::{Duration, Instant};

/// Represents the health status of a sink component.
///
/// This enum provides more granular status information than a simple boolean,
/// allowing for better observability and debugging.
#[derive(Debug, Serialize, Clone, PartialEq, Default)]
pub enum SinkStatus {
    /// Sink is operating normally
    Healthy,
    /// Sink is degraded but still functioning
    Degraded { reason: String },
    /// Sink has failed and is not functioning
    Unhealthy { error: String },
    /// Sink has not been initialized yet
    #[default]
    NotStarted,
}

impl SinkStatus {
    /// Returns true if the sink is operational (healthy or degraded but functional)
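    ///
    /// ```rust
    /// use inklog::metrics::SinkStatus;
    ///
    /// assert!(SinkStatus::Healthy.is_operational());
    /// assert!(SinkStatus::Degraded { reason: "slow".into() }.is_operational());
    /// assert!(!SinkStatus::NotStarted.is_operational());
    /// ```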
    pub fn is_operational(&self) -> bool {
        match self {
            SinkStatus::Healthy => true,
            SinkStatus::Degraded { .. } => true,
            SinkStatus::Unhealthy { .. } => false,
            SinkStatus::NotStarted => false,
        }
    }

    /// Returns true if the sink is completely healthy with no issues
    fn is_fully_healthy(&self) -> bool {
        self == &SinkStatus::Healthy
    }
}

#[derive(Debug, Serialize, Clone)]
pub struct SinkHealth {
    pub status: SinkStatus,
    pub last_error: Option<String>,
    pub consecutive_failures: u32,
}

impl Default for SinkHealth {
    fn default() -> Self {
        Self {
            status: SinkStatus::NotStarted,
            last_error: None,
            consecutive_failures: 0,
        }
    }
}

impl SinkHealth {
    /// Creates a healthy sink status
    pub fn healthy() -> Self {
        Self {
            status: SinkStatus::Healthy,
            last_error: None,
            consecutive_failures: 0,
        }
    }

    /// Creates an unhealthy sink status with the given error
    pub fn unhealthy(error: String) -> Self {
        Self {
            status: SinkStatus::Unhealthy {
                error: error.clone(),
            },
            last_error: Some(error),
            consecutive_failures: 1,
        }
    }
}

/// Gauge metric for atomic counter values
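///
/// A minimal usage sketch:
///
/// ```rust
/// use inklog::metrics::Gauge;
///
/// let g = Gauge::new(0);
/// g.inc();
/// g.inc();
/// g.dec();
/// assert_eq!(g.get(), 1);
/// ```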
#[derive(Debug)]
pub struct Gauge {
    value: AtomicI64,
}

impl Gauge {
    pub fn new(val: i64) -> Self {
        Self {
            value: AtomicI64::new(val),
        }
    }
    pub fn set(&self, v: i64) {
        self.value.store(v, Ordering::Relaxed);
    }
    pub fn get(&self) -> i64 {
        self.value.load(Ordering::Relaxed)
    }
    pub fn inc(&self) {
        self.value.fetch_add(1, Ordering::Relaxed);
    }
    pub fn dec(&self) {
        self.value.fetch_sub(1, Ordering::Relaxed);
    }
}

/// Histogram metric for latency distribution
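///
/// A minimal usage sketch; two bounds create three buckets (up to 10, up to
/// 100, and overflow):
///
/// ```rust
/// use inklog::metrics::Histogram;
///
/// let h = Histogram::new(vec![10, 100]);
/// h.record(5);    // first bucket
/// h.record(50);   // second bucket
/// h.record(500);  // overflow bucket
/// assert_eq!(h.snapshot(), vec![1, 1, 1]);
/// ```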
#[derive(Debug)]
pub struct Histogram {
    buckets: Vec<AtomicU64>,
    bounds: Vec<u64>, // in microseconds
}

impl Histogram {
    pub fn new(bounds: Vec<u64>) -> Self {
        let mut buckets = Vec::with_capacity(bounds.len() + 1);
        for _ in 0..=bounds.len() {
            buckets.push(AtomicU64::new(0));
        }
        Self { buckets, bounds }
    }

    /// Records a value into the first bucket whose upper bound is at or
    /// above the value (matching Prometheus `le` semantics); values above
    /// every bound land in the overflow bucket.
    pub fn record(&self, value: u64) {
        let mut index = self.bounds.len();
        for (i, &bound) in self.bounds.iter().enumerate() {
            if value <= bound {
                index = i;
                break;
            }
        }
        self.buckets[index].fetch_add(1, Ordering::Relaxed);
    }

    pub fn snapshot(&self) -> Vec<u64> {
        self.buckets
            .iter()
            .map(|b| b.load(Ordering::Relaxed))
            .collect()
    }
}

#[derive(Debug, Serialize)]
pub struct MetricsSnapshot {
    pub logs_written: u64,
    pub logs_dropped: u64,
    pub channel_blocked: u64,
    pub sink_errors: u64,
    pub avg_latency_us: u64,
    pub latency_distribution: Vec<u64>,
    pub active_workers: i64,
}

#[derive(Debug, Serialize)]
pub struct HealthStatus {
    /// Overall health level (derived from individual sink statuses)
    pub overall_status: SinkStatus,
    pub sinks: HashMap<String, SinkHealth>,
    pub channel_usage: f64,
    pub uptime_seconds: u64,
    pub metrics: MetricsSnapshot,
}

/// Health monitoring metrics collector.
///
/// This struct provides the following accessor methods for reading counter values:
/// - [`logs_written()`](Metrics::logs_written) - Total logs successfully written
/// - [`logs_dropped()`](Metrics::logs_dropped) - Total logs dropped
/// - [`channel_blocked()`](Metrics::channel_blocked) - Total channel blocking events
/// - [`sink_errors()`](Metrics::sink_errors) - Total sink errors
///
/// # Example
///
/// ```rust
/// use inklog::Metrics;
///
/// let metrics = Metrics::new();
/// metrics.inc_logs_written();
/// assert_eq!(metrics.logs_written(), 1);
/// ```
#[derive(Debug)]
pub struct Metrics {
    pub(crate) logs_written_total: AtomicU64,
    pub(crate) logs_dropped_total: AtomicU64,
    pub(crate) channel_send_blocked_total: AtomicU64,
    pub(crate) sink_errors_total: AtomicU64,
    pub(crate) start_time: Instant,

    // Latency tracking
    pub(crate) total_latency_us: AtomicU64,
    pub(crate) latency_count: AtomicU64,
    pub(crate) latency_histogram: Histogram,

    // Gauges
    pub(crate) active_workers: Gauge,

    // Sink Health
    pub(crate) sink_health: Mutex<HashMap<String, SinkHealth>>,
}

impl Default for Metrics {
    fn default() -> Self {
        // Default buckets: 1ms, 5ms, 10ms, 50ms, 100ms, 500ms, 1s
        let bounds = vec![1000, 5000, 10000, 50000, 100000, 500000, 1000000];
        Self {
            logs_written_total: AtomicU64::new(0),
            logs_dropped_total: AtomicU64::new(0),
            channel_send_blocked_total: AtomicU64::new(0),
            sink_errors_total: AtomicU64::new(0),
            start_time: Instant::now(),
            total_latency_us: AtomicU64::new(0),
            latency_count: AtomicU64::new(0),
            latency_histogram: Histogram::new(bounds),
            active_workers: Gauge::new(0),
            sink_health: Mutex::new(HashMap::new()),
        }
    }
}

impl Metrics {
    pub fn new() -> Self {
        Self::default()
    }

    /// Audit helper for logging internal state access.
    /// Emits only when the `tracing` subscriber has the debug level enabled.
    #[inline]
    fn audit_access(&self, field: &str) {
        tracing::debug!(event = "internal_state_access", field = field);
    }

    /// Returns the total number of logs successfully written.
    pub fn logs_written(&self) -> u64 {
        self.logs_written_total.load(Ordering::Relaxed)
    }

    /// Returns the total number of logs dropped.
    pub fn logs_dropped(&self) -> u64 {
        self.logs_dropped_total.load(Ordering::Relaxed)
    }

    /// Returns the total number of times the channel was blocked.
    pub fn channel_blocked(&self) -> u64 {
        self.channel_send_blocked_total.load(Ordering::Relaxed)
    }

    /// Returns the total number of sink errors.
    pub fn sink_errors(&self) -> u64 {
        self.sink_errors_total.load(Ordering::Relaxed)
    }

    /// Returns the number of active workers (with audit logging).
    pub fn active_workers(&self) -> i64 {
        self.audit_access("active_workers");
        self.active_workers.get()
    }

    /// Returns the sink health status map (with audit logging).
    pub fn sink_health(&self) -> std::collections::HashMap<String, SinkHealth> {
        self.audit_access("sink_health");
        match self.sink_health.lock() {
            Ok(guard) => guard.clone(),
            Err(_) => std::collections::HashMap::new(),
        }
    }

    /// Returns the uptime duration.
    pub fn uptime(&self) -> Duration {
        self.start_time.elapsed()
    }

    pub fn inc_logs_written(&self) {
        self.logs_written_total.fetch_add(1, Ordering::Relaxed);
    }

    pub fn inc_logs_dropped(&self) {
        self.logs_dropped_total.fetch_add(1, Ordering::Relaxed);
    }

    pub fn inc_channel_blocked(&self) {
        self.channel_send_blocked_total
            .fetch_add(1, Ordering::Relaxed);
    }

    pub fn inc_sink_error(&self) {
        self.sink_errors_total.fetch_add(1, Ordering::Relaxed);
    }

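    /// Adds one latency sample to the running average and the histogram.
    ///
    /// ```rust
    /// use std::time::Duration;
    /// use inklog::metrics::Metrics;
    ///
    /// let metrics = Metrics::new();
    /// metrics.record_latency(Duration::from_micros(250));
    /// ```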
    pub fn record_latency(&self, duration: Duration) {
        let micros = duration.as_micros() as u64;
        self.total_latency_us.fetch_add(micros, Ordering::Relaxed);
        self.latency_count.fetch_add(1, Ordering::Relaxed);
        self.latency_histogram.record(micros);
    }

    /// Updates the health status of a sink component.
    ///
    /// # Arguments
    /// * `name` - The name of the sink
    /// * `healthy` - Whether the sink is healthy
    /// * `error` - Optional error message if the sink is unhealthy
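    ///
    /// ```rust
    /// use inklog::metrics::Metrics;
    ///
    /// let metrics = Metrics::new();
    /// metrics.update_sink_health("file", false, Some("disk full".to_string()));
    /// metrics.update_sink_health("file", false, Some("disk full".to_string()));
    /// assert_eq!(metrics.sink_health()["file"].consecutive_failures, 2);
    /// ```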
    pub fn update_sink_health(&self, name: &str, healthy: bool, error: Option<String>) {
        // Prepare the new status outside the lock to keep the critical
        // section short.
        let status = if healthy {
            SinkStatus::Healthy
        } else {
            SinkStatus::Unhealthy {
                error: error.as_deref().unwrap_or("Unknown error").to_string(),
            }
        };

        // Read and write under a single guard: acquiring the lock twice
        // (once to read consecutive_failures, once to write) would allow a
        // concurrent update to be lost between the two acquisitions.
        if let Ok(mut map) = self.sink_health.lock() {
            let entry = map
                .entry(name.to_string())
                .or_insert_with(SinkHealth::healthy);
            entry.status = status;
            if healthy {
                entry.consecutive_failures = 0;
                entry.last_error = None;
            } else {
                entry.consecutive_failures += 1;
                entry.last_error = error;
            }
        }
    }

    /// Reports that a sink has started (transitions from NotStarted to Healthy)
    pub fn sink_started(&self, name: &str) {
        if let Ok(mut map) = self.sink_health.lock() {
            let entry = map
                .entry(name.to_string())
                .or_insert_with(SinkHealth::healthy);
            entry.status = SinkStatus::Healthy;
            entry.consecutive_failures = 0;
            entry.last_error = None;
        }
    }

    /// Reports that a sink has degraded but is still operational
    pub fn sink_degraded(&self, name: &str, reason: String) {
        if let Ok(mut map) = self.sink_health.lock() {
            let entry = map
                .entry(name.to_string())
                .or_insert_with(SinkHealth::healthy);
            entry.status = SinkStatus::Degraded {
                reason: reason.clone(),
            };
            entry.last_error = Some(reason);
        }
    }

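    /// Builds a point-in-time health report from the sink map, channel
    /// occupancy, and counter snapshots.
    ///
    /// # Arguments
    /// * `channel_len` - Current number of queued records in the channel
    /// * `channel_cap` - Channel capacity, used to compute `channel_usage`
    ///
    /// ```rust
    /// use inklog::metrics::Metrics;
    ///
    /// let metrics = Metrics::new();
    /// metrics.update_sink_health("console", true, None);
    /// let status = metrics.get_status(0, 10_000);
    /// assert_eq!(status.channel_usage, 0.0);
    /// ```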
    pub fn get_status(&self, channel_len: usize, channel_cap: usize) -> HealthStatus {
        let sinks: std::collections::HashMap<String, SinkHealth> = match self.sink_health.lock() {
            Ok(guard) => guard.clone(),
            Err(_e) => {
                eprintln!("Metrics mutex poisoned, using empty data");
                std::collections::HashMap::new()
            }
        };

        // Determine overall status based on sink statuses
        let overall_status = if sinks.is_empty() {
            SinkStatus::NotStarted
        } else {
            let all_healthy = sinks.values().all(|s| s.status.is_fully_healthy());
            let any_unhealthy = sinks
                .values()
                .any(|s| matches!(s.status, SinkStatus::Unhealthy { .. }));
            let any_degraded = sinks
                .values()
                .any(|s| matches!(s.status, SinkStatus::Degraded { .. }));

            if all_healthy {
                SinkStatus::Healthy
            } else if any_unhealthy {
                let errors: Vec<String> = sinks
                    .values()
                    .filter_map(|s| {
                        if let SinkStatus::Unhealthy { error } = &s.status {
                            Some(error.clone())
                        } else {
                            None
                        }
                    })
                    .collect();
                SinkStatus::Unhealthy {
                    error: errors.join("; "),
                }
            } else if any_degraded {
                let reasons: Vec<String> = sinks
                    .values()
                    .filter_map(|s| {
                        if let SinkStatus::Degraded { reason } = &s.status {
                            Some(reason.clone())
                        } else {
                            None
                        }
                    })
                    .collect();
                SinkStatus::Degraded {
                    reason: reasons.join("; "),
                }
            } else {
                SinkStatus::Healthy
            }
        };

        let count = self.latency_count.load(Ordering::Relaxed);
        let total = self.total_latency_us.load(Ordering::Relaxed);
        let avg_latency = if count > 0 { total / count } else { 0 };

        HealthStatus {
            overall_status,
            sinks,
            channel_usage: if channel_cap > 0 {
                channel_len as f64 / channel_cap as f64
            } else {
                0.0
            },
            uptime_seconds: self.uptime().as_secs(),
            metrics: MetricsSnapshot {
                logs_written: self.logs_written_total.load(Ordering::Relaxed),
                logs_dropped: self.logs_dropped_total.load(Ordering::Relaxed),
                channel_blocked: self.channel_send_blocked_total.load(Ordering::Relaxed),
                sink_errors: self.sink_errors_total.load(Ordering::Relaxed),
                avg_latency_us: avg_latency,
                latency_distribution: self.latency_histogram.snapshot(),
                active_workers: self.active_workers.get(),
            },
        }
    }

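    /// Renders all counters, gauges, and the latency histogram in the
    /// Prometheus text exposition format.
    ///
    /// ```rust
    /// use inklog::metrics::Metrics;
    ///
    /// let metrics = Metrics::new();
    /// metrics.inc_logs_written();
    /// let text = metrics.export_prometheus();
    /// assert!(text.contains("inklog_logs_written_total 1"));
    /// ```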
    pub fn export_prometheus(&self) -> String {
        let mut s = String::new();
        s.push_str("# HELP inklog_logs_written_total Total logs successfully written\n");
        s.push_str("# TYPE inklog_logs_written_total counter\n");
        s.push_str(&format!(
            "inklog_logs_written_total {}\n",
            self.logs_written_total.load(Ordering::Relaxed)
        ));

        s.push_str("# HELP inklog_logs_dropped_total Total logs dropped\n");
        s.push_str("# TYPE inklog_logs_dropped_total counter\n");
        s.push_str(&format!(
            "inklog_logs_dropped_total {}\n",
            self.logs_dropped_total.load(Ordering::Relaxed)
        ));

        s.push_str("# HELP inklog_channel_blocked_total Total times channel was blocked\n");
        s.push_str("# TYPE inklog_channel_blocked_total counter\n");
        s.push_str(&format!(
            "inklog_channel_blocked_total {}\n",
            self.channel_send_blocked_total.load(Ordering::Relaxed)
        ));

        s.push_str("# HELP inklog_sink_errors_total Total sink errors\n");
        s.push_str("# TYPE inklog_sink_errors_total counter\n");
        s.push_str(&format!(
            "inklog_sink_errors_total {}\n",
            self.sink_errors_total.load(Ordering::Relaxed)
        ));

        s.push_str("# HELP inklog_active_workers Current active worker threads\n");
        s.push_str("# TYPE inklog_active_workers gauge\n");
        s.push_str(&format!(
            "inklog_active_workers {}\n",
            self.active_workers.get()
        ));

        // Average latency gauge (total latency / sample count)
        let count = self.latency_count.load(Ordering::Relaxed);
        let total = self.total_latency_us.load(Ordering::Relaxed);
        let avg_latency = if count > 0 { total / count } else { 0 };

        s.push_str("# HELP inklog_avg_latency_us Average log processing latency in microseconds\n");
        s.push_str("# TYPE inklog_avg_latency_us gauge\n");
        s.push_str(&format!("inklog_avg_latency_us {}\n", avg_latency));

        // Uptime gauge; emitted once at least one second has elapsed
        let uptime = self.uptime().as_secs();
        if uptime > 0 {
            s.push_str("# HELP inklog_uptime_seconds Uptime in seconds\n");
            s.push_str("# TYPE inklog_uptime_seconds gauge\n");
            s.push_str(&format!("inklog_uptime_seconds {}\n", uptime));
        }

        // Per-sink health gauge (1 = operational, 0 = not)
        s.push_str("# HELP inklog_sink_healthy Sink health status (1=healthy, 0=unhealthy)\n");
        s.push_str("# TYPE inklog_sink_healthy gauge\n");
        if let Ok(health_map) = self.sink_health.lock() {
            for (name, health) in health_map.iter() {
                let value = if health.status.is_operational() { 1 } else { 0 };
                s.push_str(&format!(
                    "inklog_sink_healthy{{sink=\"{}\"}} {}\n",
                    name, value
                ));
            }
        }

        // Latency histogram buckets. Prometheus `le` buckets are cumulative:
        // each bucket counts all observations at or below its bound.
        s.push_str("# HELP inklog_latency_bucket Latency histogram bucket (microseconds, cumulative)\n");
        s.push_str("# TYPE inklog_latency_bucket counter\n");
        let bounds = &self.latency_histogram.bounds;
        let buckets = self.latency_histogram.snapshot();
        let mut cumulative: u64 = 0;
        for (i, &bound) in bounds.iter().enumerate() {
            if i < buckets.len() {
                cumulative += buckets[i];
                s.push_str(&format!(
                    "inklog_latency_bucket{{le=\"{}\"}} {}\n",
                    bound, cumulative
                ));
            }
        }
        // The +Inf bucket is the total number of observations.
        let total_count: u64 = buckets.iter().sum();
        s.push_str(&format!(
            "inklog_latency_bucket{{le=\"+Inf\"}} {}\n",
            total_count
        ));

        s
    }
}