saorsa_core/health/
metrics.rs

1// Copyright 2024 Saorsa Labs Limited
2//
3// This software is dual-licensed under:
4// - GNU Affero General Public License v3.0 or later (AGPL-3.0-or-later)
5// - Commercial License
6//
7// For AGPL-3.0 license, see LICENSE-AGPL-3.0
8// For commercial licensing, contact: saorsalabs@gmail.com
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under these licenses is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
14//! Prometheus metrics export for P2P health monitoring
15
16use super::{HealthManager, HealthStatus};
17use crate::Result;
18use std::fmt::Write;
19use std::sync::Arc;
20use std::time::{SystemTime, UNIX_EPOCH};
21
/// Structured snapshot of node health, suitable for programmatic consumers.
///
/// This is the typed counterpart of the text produced by
/// `PrometheusExporter::export`; it is populated by
/// `PrometheusExporter::export_metrics`.
///
/// `Default` yields an all-zero snapshot, which is convenient as a base for
/// struct-update syntax (`HealthMetrics { healthy_components: 1, ..Default::default() }`).
#[derive(Debug, Clone, PartialEq, Default)]
pub struct HealthMetrics {
    /// Node uptime in seconds.
    pub uptime_seconds: f64,
    /// Number of components whose last check reported `Healthy`.
    pub healthy_components: u64,
    /// Number of components whose last check reported `Degraded`.
    pub degraded_components: u64,
    /// Number of components whose last check reported `Unhealthy`.
    pub unhealthy_components: u64,
    /// Total number of components with a health check result.
    pub total_components: u64,
    /// Network peer count (0 when the `network` component does not report it).
    pub network_peer_count: u64,
    /// DHT routing table size (0 when the `dht` component does not report it).
    pub dht_routing_table_size: u64,
    /// Active connections (0 when the `network` component does not report it).
    pub active_connections: u64,
    /// Memory usage in bytes, as reported by the runtime debug info.
    pub memory_usage_bytes: u64,
    /// CPU usage percentage.
    ///
    /// `PrometheusExporter::export_metrics` currently always sets this to
    /// `0.0` because no CPU monitoring is wired in.
    pub cpu_usage_percent: f64,
    /// Bandwidth usage in bytes per second (0 when not reported).
    pub bandwidth_usage_bps: u64,
    /// Storage free space in bytes (0 when not reported).
    pub storage_free_bytes: u64,
    /// DHT operations per second (0.0 when not reported).
    pub dht_ops_per_second: f64,
}
51
52/// Prometheus exporter for health metrics
53pub struct PrometheusExporter {
54    health_manager: Arc<HealthManager>,
55}
56
57impl PrometheusExporter {
58    /// Create a new Prometheus exporter
59    pub fn new(health_manager: Arc<HealthManager>) -> Self {
60        Self { health_manager }
61    }
62
63    /// Export metrics in Prometheus format
64    pub async fn export(&self) -> Result<String> {
65        let health = self.health_manager.get_health().await?;
66        let debug_info = self.health_manager.get_debug_info().await?;
67
68        let mut output = String::with_capacity(4096);
69
70        // Node info
71        writeln!(
72            &mut output,
73            "# HELP p2p_node_info Node information\n# TYPE p2p_node_info gauge\np2p_node_info{{version=\"{}\",os=\"{}\",arch=\"{}\"}} 1",
74            health.version,
75            debug_info.system.os,
76            debug_info.system.arch
77        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
78
79        // Uptime
80        writeln!(
81            &mut output,
82            "\n# HELP p2p_uptime_seconds Node uptime in seconds\n# TYPE p2p_uptime_seconds counter\np2p_uptime_seconds {}",
83            health.uptime.as_secs_f64()
84        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
85
86        // Component health status
87        let mut healthy = 0u64;
88        let mut degraded = 0u64;
89        let mut unhealthy = 0u64;
90
91        for component in health.checks.values() {
92            match component.status {
93                HealthStatus::Healthy => healthy += 1,
94                HealthStatus::Degraded => degraded += 1,
95                HealthStatus::Unhealthy => unhealthy += 1,
96            }
97        }
98
99        writeln!(
100            &mut output,
101            "\n# HELP p2p_health_status Health status of components (1=healthy, 0=unhealthy)\n# TYPE p2p_health_status gauge"
102        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
103
104        for (name, component) in &health.checks {
105            let value = match component.status {
106                HealthStatus::Healthy => 1,
107                HealthStatus::Degraded => 0, // Could use 0.5 for degraded
108                HealthStatus::Unhealthy => 0,
109            };
110            writeln!(
111                &mut output,
112                "p2p_health_status{{component=\"{}\"}} {}",
113                name, value
114            )
115            .map_err(|e| {
116                crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into())
117            })?;
118        }
119
120        // Component latency
121        writeln!(
122            &mut output,
123            "\n# HELP p2p_health_check_latency_ms Health check latency in milliseconds\n# TYPE p2p_health_check_latency_ms gauge"
124        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
125
126        for (name, component) in &health.checks {
127            writeln!(
128                &mut output,
129                "p2p_health_check_latency_ms{{component=\"{}\"}} {}",
130                name, component.latency_ms
131            )
132            .map_err(|e| {
133                crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into())
134            })?;
135        }
136
137        // Summary metrics
138        writeln!(
139            &mut output,
140            "\n# HELP p2p_healthy_components Number of healthy components\n# TYPE p2p_healthy_components gauge\np2p_healthy_components {}",
141            healthy
142        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
143
144        writeln!(
145            &mut output,
146            "\n# HELP p2p_degraded_components Number of degraded components\n# TYPE p2p_degraded_components gauge\np2p_degraded_components {}",
147            degraded
148        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
149
150        writeln!(
151            &mut output,
152            "\n# HELP p2p_unhealthy_components Number of unhealthy components\n# TYPE p2p_unhealthy_components gauge\np2p_unhealthy_components {}",
153            unhealthy
154        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
155
156        // System metrics
157        writeln!(
158            &mut output,
159            "\n# HELP p2p_system_cpu_count Number of CPU cores\n# TYPE p2p_system_cpu_count gauge\np2p_system_cpu_count {}",
160            debug_info.system.cpu_count
161        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
162
163        writeln!(
164            &mut output,
165            "\n# HELP p2p_system_memory_total_bytes Total system memory in bytes\n# TYPE p2p_system_memory_total_bytes gauge\np2p_system_memory_total_bytes {}",
166            debug_info.system.total_memory
167        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
168
169        writeln!(
170            &mut output,
171            "\n# HELP p2p_system_memory_available_bytes Available system memory in bytes\n# TYPE p2p_system_memory_available_bytes gauge\np2p_system_memory_available_bytes {}",
172            debug_info.system.available_memory
173        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
174
175        // Runtime metrics
176        writeln!(
177            &mut output,
178            "\n# HELP p2p_runtime_threads Number of runtime threads\n# TYPE p2p_runtime_threads gauge\np2p_runtime_threads {}",
179            debug_info.runtime.thread_count
180        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
181
182        writeln!(
183            &mut output,
184            "\n# HELP p2p_runtime_memory_usage_bytes Runtime memory usage in bytes\n# TYPE p2p_runtime_memory_usage_bytes gauge\np2p_runtime_memory_usage_bytes {}",
185            debug_info.runtime.memory_usage
186        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
187
188        // Extract component-specific metrics from metadata
189        for (name, component) in &health.checks {
190            for (key, value) in &component.metadata {
191                if let Some(num) = value.as_u64() {
192                    writeln!(
193                        &mut output,
194                        "\n# HELP p2p_{}_{} Component-specific metric\n# TYPE p2p_{}_{} gauge\np2p_{}_{} {}",
195                        name, key, name, key, name, key, num
196                    ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
197                } else if let Some(num) = value.as_f64() {
198                    writeln!(
199                        &mut output,
200                        "\n# HELP p2p_{}_{} Component-specific metric\n# TYPE p2p_{}_{} gauge\np2p_{}_{} {}",
201                        name, key, name, key, name, key, num
202                    ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
203                }
204            }
205        }
206
207        // Last scrape timestamp
208        let timestamp = SystemTime::now()
209            .duration_since(UNIX_EPOCH)
210            .map_err(|e| {
211                crate::P2PError::Internal(format!("Failed to get timestamp: {}", e).into())
212            })?
213            .as_secs();
214
215        writeln!(
216            &mut output,
217            "\n# HELP p2p_last_scrape_timestamp_seconds Unix timestamp of last scrape\n# TYPE p2p_last_scrape_timestamp_seconds gauge\np2p_last_scrape_timestamp_seconds {}",
218            timestamp
219        ).map_err(|e| crate::P2PError::Internal(format!("Failed to write metrics: {}", e).into()))?;
220
221        Ok(output)
222    }
223
224    /// Export metrics as a structured object
225    pub async fn export_metrics(&self) -> Result<HealthMetrics> {
226        let health = self.health_manager.get_health().await?;
227        let debug_info = self.health_manager.get_debug_info().await?;
228
229        let mut healthy = 0u64;
230        let mut degraded = 0u64;
231        let mut unhealthy = 0u64;
232
233        for component in health.checks.values() {
234            match component.status {
235                HealthStatus::Healthy => healthy += 1,
236                HealthStatus::Degraded => degraded += 1,
237                HealthStatus::Unhealthy => unhealthy += 1,
238            }
239        }
240
241        // Extract metrics from component metadata
242        let mut network_peer_count = 0u64;
243        let mut dht_routing_table_size = 0u64;
244        let mut active_connections = 0u64;
245        let mut dht_ops_per_second = 0.0;
246        let mut bandwidth_usage_bps = 0u64;
247        let mut storage_free_bytes = 0u64;
248
249        for (name, component) in &health.checks {
250            match name.as_str() {
251                "network" => {
252                    if let Some(count) = component
253                        .metadata
254                        .get("peer_count")
255                        .and_then(|v| v.as_u64())
256                    {
257                        network_peer_count = count;
258                    }
259                    if let Some(count) = component
260                        .metadata
261                        .get("active_connections")
262                        .and_then(|v| v.as_u64())
263                    {
264                        active_connections = count;
265                    }
266                }
267                "dht" => {
268                    if let Some(size) = component
269                        .metadata
270                        .get("routing_table_size")
271                        .and_then(|v| v.as_u64())
272                    {
273                        dht_routing_table_size = size;
274                    }
275                }
276                "resources" => {
277                    if let Some(ops) = component
278                        .metadata
279                        .get("dht_ops_per_sec")
280                        .and_then(|v| v.as_f64())
281                    {
282                        dht_ops_per_second = ops;
283                    }
284
285                    if let Some(bw) = component
286                        .metadata
287                        .get("bandwidth_usage")
288                        .and_then(|v| v.as_u64())
289                    {
290                        bandwidth_usage_bps = bw;
291                    }
292                }
293                "storage" => {
294                    if let Some(free) = component
295                        .metadata
296                        .get("free_space")
297                        .and_then(|v| v.as_u64())
298                    {
299                        storage_free_bytes = free;
300                    }
301                }
302                _ => {}
303            }
304        }
305
306        Ok(HealthMetrics {
307            uptime_seconds: health.uptime.as_secs_f64(),
308            healthy_components: healthy,
309            degraded_components: degraded,
310            unhealthy_components: unhealthy,
311            total_components: health.checks.len() as u64,
312            network_peer_count,
313            dht_routing_table_size,
314            active_connections,
315            memory_usage_bytes: debug_info.runtime.memory_usage,
316            cpu_usage_percent: 0.0, // Would need actual CPU monitoring
317            bandwidth_usage_bps,
318            storage_free_bytes,
319            dht_ops_per_second,
320        })
321    }
322}
323
#[cfg(test)]
mod tests {
    use super::*;
    use crate::health::HealthManager;

    /// Checker that always reports a healthy component and exposes no debug info.
    struct MockChecker;

    #[async_trait::async_trait]
    impl crate::health::checks::ComponentChecker for MockChecker {
        async fn check(&self) -> Result<HealthStatus> {
            Ok(HealthStatus::Healthy)
        }
    }

    /// Checker that is always healthy and additionally exposes numeric debug info.
    struct MockCheckerWithMetadata;

    #[async_trait::async_trait]
    impl crate::health::checks::ComponentChecker for MockCheckerWithMetadata {
        async fn check(&self) -> Result<HealthStatus> {
            Ok(HealthStatus::Healthy)
        }

        async fn debug_info(&self) -> Option<serde_json::Value> {
            Some(serde_json::json!({
                "peer_count": 10,
                "connection_rate": 5.5,
            }))
        }
    }

    /// Builds a health manager for version "1.0.0" with no registered checkers.
    fn new_manager() -> Arc<HealthManager> {
        Arc::new(HealthManager::new("1.0.0".to_string()))
    }

    /// Asserts that `text` contains every fragment in `fragments`.
    fn assert_contains_all(text: &str, fragments: &[&str]) {
        for fragment in fragments {
            assert!(text.contains(fragment), "missing fragment: {:?}", fragment);
        }
    }

    #[tokio::test]
    async fn test_prometheus_export_basic() {
        let exporter = PrometheusExporter::new(new_manager());

        let metrics = exporter.export().await.unwrap();

        // Every always-present family must announce itself with HELP and TYPE.
        assert_contains_all(
            &metrics,
            &[
                "# HELP p2p_node_info",
                "# TYPE p2p_node_info gauge",
                "p2p_node_info{",
                "# HELP p2p_uptime_seconds",
                "# TYPE p2p_uptime_seconds counter",
                "p2p_uptime_seconds",
                "# HELP p2p_health_status",
                "# TYPE p2p_health_status gauge",
                "# HELP p2p_last_scrape_timestamp_seconds",
                "# TYPE p2p_last_scrape_timestamp_seconds gauge",
            ],
        );
    }

    #[tokio::test]
    async fn test_prometheus_export_with_components() {
        let health_manager = new_manager();
        health_manager
            .register_checker("test_component", Box::new(MockChecker))
            .await;

        let metrics = PrometheusExporter::new(health_manager)
            .export()
            .await
            .unwrap();

        // One healthy component must show up per-component and in the summary.
        assert_contains_all(
            &metrics,
            &[
                "p2p_health_status{component=\"test_component\"}",
                "p2p_health_check_latency_ms{component=\"test_component\"}",
                "p2p_healthy_components 1",
                "p2p_degraded_components 0",
                "p2p_unhealthy_components 0",
            ],
        );
    }

    #[tokio::test]
    async fn test_health_metrics_structure() {
        let exporter = PrometheusExporter::new(new_manager());

        let snapshot = exporter.export_metrics().await.unwrap();

        // With no checkers registered every component count is zero.
        assert!(snapshot.uptime_seconds >= 0.0);
        assert_eq!(snapshot.healthy_components, 0);
        assert_eq!(snapshot.degraded_components, 0);
        assert_eq!(snapshot.unhealthy_components, 0);
        assert_eq!(snapshot.total_components, 0);
    }

    #[tokio::test]
    async fn test_prometheus_format_validation() {
        let exporter = PrometheusExporter::new(new_manager());

        let metrics = exporter.export().await.unwrap();

        // Validate Prometheus format line by line, skipping blank separators.
        for line in metrics.lines().filter(|candidate| !candidate.is_empty()) {
            // The exporter only ever writes HELP and TYPE comments.
            if line.starts_with('#') {
                assert!(line.starts_with("# HELP") || line.starts_with("# TYPE"));
                continue;
            }

            // Sample lines are "<name and labels> <value>".
            assert!(line.contains(' '));
            let fields: Vec<&str> = line.splitn(2, ' ').collect();
            assert_eq!(fields.len(), 2);

            // Value should be numeric
            let sample_value = fields[1].trim();
            assert!(
                sample_value.parse::<f64>().is_ok(),
                "Invalid metric value: {}",
                sample_value
            );
        }
    }

    #[tokio::test]
    async fn test_export_with_metadata() {
        let health_manager = new_manager();
        health_manager
            .register_checker("network", Box::new(MockCheckerWithMetadata))
            .await;

        let snapshot = PrometheusExporter::new(health_manager)
            .export_metrics()
            .await
            .unwrap();

        // export_metrics reads metadata from the health check result; the mock
        // only provides values through debug_info, so just the component
        // counts are asserted here.
        assert_eq!(snapshot.total_components, 1);
        assert_eq!(snapshot.healthy_components, 1);
    }
}
462}