mockforge_observability/
system_metrics.rs

1//! System metrics collection
2//!
3//! Provides background tasks for collecting system-level metrics including:
4//! - Memory usage
5//! - CPU usage
6//! - Thread count
7//! - Server uptime
8
9use std::time::{Duration, Instant};
10use tokio::time::interval;
11use tracing::{debug, error, warn};
12
13use crate::prometheus::MetricsRegistry;
14
15/// Start system metrics collection background task
16///
17/// This task periodically collects system metrics and updates the Prometheus registry.
18/// The collection interval is configurable (default: 15 seconds).
19///
20/// # Arguments
21/// * `registry` - The metrics registry to update
22/// * `collection_interval` - How often to collect metrics (default: 15 seconds)
23///
24/// # Example
25/// ```no_run
26/// use mockforge_observability::{get_global_registry, system_metrics::start_system_metrics_collector};
27/// use std::time::Duration;
28///
29/// #[tokio::main]
30/// async fn main() {
31///     let registry = get_global_registry();
32///     start_system_metrics_collector(registry, Duration::from_secs(15));
33/// }
34/// ```
35pub fn start_system_metrics_collector(
36    registry: &'static MetricsRegistry,
37    collection_interval: Duration,
38) -> tokio::task::JoinHandle<()> {
39    let start_time = Instant::now();
40
41    tokio::spawn(async move {
42        let mut ticker = interval(collection_interval);
43        debug!("System metrics collector started (interval: {:?})", collection_interval);
44
45        loop {
46            ticker.tick().await;
47
48            // Collect and update metrics
49            if let Err(e) = collect_and_update_metrics(registry, start_time).await {
50                error!("Failed to collect system metrics: {}", e);
51            }
52        }
53    })
54}
55
56/// Collect and update all system metrics
57async fn collect_and_update_metrics(
58    registry: &MetricsRegistry,
59    start_time: Instant,
60) -> Result<(), Box<dyn std::error::Error>> {
61    // Update uptime
62    let uptime_seconds = start_time.elapsed().as_secs_f64();
63    registry.update_uptime(uptime_seconds);
64
65    // Collect system information using sysinfo
66    #[cfg(feature = "sysinfo")]
67    {
68        use sysinfo::System;
69
70        let mut sys = System::new_all();
71        sys.refresh_all();
72
73        // Memory usage
74        let memory_used = sys.used_memory() as f64;
75        registry.update_memory_usage(memory_used);
76
77        // CPU usage (global)
78        let cpu_usage = sys.global_cpu_usage() as f64;
79        registry.update_cpu_usage(cpu_usage);
80
81        debug!(
82            "System metrics updated - Memory: {:.2} MB, CPU: {:.2}%, Uptime: {:.2}s",
83            memory_used / 1024.0 / 1024.0,
84            cpu_usage,
85            uptime_seconds
86        );
87    }
88
89    // Collect thread count
90    #[cfg(target_os = "linux")]
91    {
92        if let Ok(thread_count) = get_thread_count_linux() {
93            registry.update_thread_count(thread_count as f64);
94        }
95    }
96
97    #[cfg(not(target_os = "linux"))]
98    {
99        // For non-Linux systems, use a simple estimation based on available parallelism
100        if let Ok(parallelism) = std::thread::available_parallelism() {
101            // This is an approximation - actual thread count may vary
102            registry.update_thread_count(parallelism.get() as f64);
103        }
104    }
105
106    Ok(())
107}
108
/// Read the current process's thread count from `/proc/self/status` (Linux only).
///
/// Looks for the first line beginning with `Threads:` whose second
/// whitespace-separated field parses as an unsigned integer.
///
/// # Errors
/// Returns the underlying I/O error if the file cannot be read, or a
/// `NotFound` error if no parsable `Threads:` line is present.
#[cfg(target_os = "linux")]
fn get_thread_count_linux() -> Result<usize, std::io::Error> {
    let contents = std::fs::read_to_string("/proc/self/status")?;

    contents
        .lines()
        .filter(|entry| entry.starts_with("Threads:"))
        // A `Threads:` line without a parsable value is skipped, not treated as fatal.
        .find_map(|entry| entry.split_whitespace().nth(1)?.parse::<usize>().ok())
        .ok_or_else(|| {
            std::io::Error::new(
                std::io::ErrorKind::NotFound,
                "Thread count not found in /proc/self/status",
            )
        })
}
130
/// Configuration for system metrics collection
///
/// Consumed by `start_with_config`, which starts the collector only when
/// `enabled` is set and converts `interval_seconds` with `Duration::from_secs`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SystemMetricsConfig {
    /// Enable system metrics collection
    pub enabled: bool,
    /// Collection interval in seconds
    pub interval_seconds: u64,
}
139
140impl Default for SystemMetricsConfig {
141    fn default() -> Self {
142        Self {
143            enabled: true,
144            interval_seconds: 15,
145        }
146    }
147}
148
149/// Start system metrics collector with configuration
150pub fn start_with_config(
151    registry: &'static MetricsRegistry,
152    config: SystemMetricsConfig,
153) -> Option<tokio::task::JoinHandle<()>> {
154    if config.enabled {
155        debug!("Starting system metrics collector with {:?}", config);
156        Some(start_system_metrics_collector(
157            registry,
158            Duration::from_secs(config.interval_seconds),
159        ))
160    } else {
161        warn!("System metrics collection is disabled");
162        None
163    }
164}
165
#[cfg(test)]
mod tests {
    use super::*;
    use crate::prometheus::MetricsRegistry;

    /// A single collection pass succeeds and records a positive uptime.
    #[tokio::test]
    async fn test_system_metrics_collection() {
        let registry = MetricsRegistry::new();
        let start_time = Instant::now();

        // Let a measurable amount of time pass first: on a coarse monotonic clock
        // two back-to-back readings can be equal, which would make the elapsed
        // time exactly 0.0 and the assertion below flaky.
        tokio::time::sleep(Duration::from_millis(5)).await;

        // Run collection once
        let result = collect_and_update_metrics(&registry, start_time).await;
        assert!(result.is_ok());

        // Verify metrics were updated
        assert!(registry.uptime_seconds.get() > 0.0);
    }

    /// The default configuration enables collection every 15 seconds.
    #[test]
    fn test_system_metrics_config_default() {
        let config = SystemMetricsConfig::default();
        assert!(config.enabled);
        assert_eq!(config.interval_seconds, 15);
    }

    /// On Linux the current process always reports at least one thread.
    #[cfg(target_os = "linux")]
    #[test]
    fn test_get_thread_count_linux() {
        let result = get_thread_count_linux();
        assert!(result.is_ok());
        let count = result.unwrap();
        assert!(count > 0);
    }
}