// zlayer_observability/metrics.rs
1//! Prometheus metrics exposition
2//!
3//! Provides a unified metrics interface and Prometheus exposition endpoint.
4
5use prometheus::{
6    Counter, CounterVec, Encoder, Gauge, GaugeVec, HistogramVec, Opts, Registry, TextEncoder,
7};
8use tracing::info;
9
10use crate::config::MetricsConfig;
11use crate::error::{ObservabilityError, Result};
12
/// `ZLayer` metrics collection
///
/// Pre-defined metrics for the `ZLayer` system. All collectors are
/// registered against the private `registry` at construction time, so
/// cloning a field hands out a handle to the same underlying metric.
pub struct ZLayerMetrics {
    // Private registry holding every collector below; exposed read-only
    // via `registry()` and used by `encode()` for text exposition.
    registry: Registry,

    // Service metrics
    /// Total number of registered services
    pub services_total: Gauge,
    /// Current replica count per service (label: `service`)
    pub service_replicas: GaugeVec,
    /// Service health status (0=unknown, 1=healthy, 2=degraded, 3=unhealthy);
    /// values match the `HealthStatus` enum discriminants (label: `service`)
    pub service_health: GaugeVec,

    // Scaling metrics
    /// Total scaling events by service and direction (labels: `service`, `direction`)
    pub scale_events_total: CounterVec,
    /// Total scale up events
    pub scale_up_total: Counter,
    /// Total scale down events
    pub scale_down_total: Counter,

    // Request metrics
    /// Total HTTP requests by method, path, and status
    pub requests_total: CounterVec,
    /// HTTP request duration in seconds (labels: `method`, `path`)
    pub request_duration_seconds: HistogramVec,

    // Raft metrics
    /// Whether this node is the Raft leader (1) or not (0)
    pub raft_is_leader: Gauge,
    /// Current Raft term
    pub raft_term: Gauge,
    /// Current Raft commit index
    pub raft_commit_index: Gauge,

    // System metrics
    /// Time since the service started in seconds
    pub uptime_seconds: Gauge,

    // GPU metrics (all labeled by `gpu_index` and `node`)
    /// GPU utilization percentage per GPU index and node
    pub gpu_utilization: GaugeVec,
    /// GPU memory used in bytes per GPU index and node
    pub gpu_memory_used: GaugeVec,
    /// GPU total memory in bytes per GPU index and node
    pub gpu_memory_total: GaugeVec,
    /// GPU temperature in celsius per GPU index and node
    pub gpu_temperature: GaugeVec,
    /// GPU power draw in watts per GPU index and node
    pub gpu_power: GaugeVec,
}
65
66impl ZLayerMetrics {
67    /// Create a new metrics collection
68    ///
69    /// # Errors
70    /// Returns an error if Prometheus metric creation or registration fails.
71    #[allow(clippy::too_many_lines)]
72    pub fn new() -> Result<Self> {
73        let registry = Registry::new();
74
75        // Service metrics
76        let services_total = Gauge::new(
77            "zlayer_services_total",
78            "Total number of registered services",
79        )
80        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
81
82        let service_replicas = GaugeVec::new(
83            Opts::new(
84                "zlayer_service_replicas",
85                "Current replica count per service",
86            ),
87            &["service"],
88        )
89        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
90
91        let service_health = GaugeVec::new(
92            Opts::new(
93                "zlayer_service_health",
94                "Service health status (0=unknown, 1=healthy, 2=degraded, 3=unhealthy)",
95            ),
96            &["service"],
97        )
98        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
99
100        // Scaling metrics
101        let scale_events_total = CounterVec::new(
102            Opts::new("zlayer_scale_events_total", "Total scaling events"),
103            &["service", "direction"],
104        )
105        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
106
107        let scale_up_total = Counter::new("zlayer_scale_up_total", "Total scale up events")
108            .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
109
110        let scale_down_total = Counter::new("zlayer_scale_down_total", "Total scale down events")
111            .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
112
113        // Request metrics
114        let requests_total = CounterVec::new(
115            Opts::new("zlayer_requests_total", "Total HTTP requests"),
116            &["method", "path", "status"],
117        )
118        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
119
120        let request_duration_seconds = HistogramVec::new(
121            prometheus::HistogramOpts::new(
122                "zlayer_request_duration_seconds",
123                "HTTP request duration in seconds",
124            )
125            .buckets(vec![
126                0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
127            ]),
128            &["method", "path"],
129        )
130        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
131
132        // Raft metrics
133        let raft_is_leader = Gauge::new(
134            "zlayer_raft_is_leader",
135            "Whether this node is the Raft leader (1) or not (0)",
136        )
137        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
138
139        let raft_term = Gauge::new("zlayer_raft_term", "Current Raft term")
140            .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
141
142        let raft_commit_index = Gauge::new("zlayer_raft_commit_index", "Current Raft commit index")
143            .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
144
145        // System metrics
146        let uptime_seconds = Gauge::new(
147            "zlayer_uptime_seconds",
148            "Time since the service started in seconds",
149        )
150        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
151
152        // GPU metrics
153        let gpu_utilization = GaugeVec::new(
154            Opts::new(
155                "zlayer_gpu_utilization_percent",
156                "GPU utilization percentage",
157            ),
158            &["gpu_index", "node"],
159        )
160        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
161
162        let gpu_memory_used = GaugeVec::new(
163            Opts::new("zlayer_gpu_memory_used_bytes", "GPU memory used in bytes"),
164            &["gpu_index", "node"],
165        )
166        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
167
168        let gpu_memory_total = GaugeVec::new(
169            Opts::new("zlayer_gpu_memory_total_bytes", "GPU total memory in bytes"),
170            &["gpu_index", "node"],
171        )
172        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
173
174        let gpu_temperature = GaugeVec::new(
175            Opts::new(
176                "zlayer_gpu_temperature_celsius",
177                "GPU temperature in celsius",
178            ),
179            &["gpu_index", "node"],
180        )
181        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
182
183        let gpu_power = GaugeVec::new(
184            Opts::new("zlayer_gpu_power_watts", "GPU power draw in watts"),
185            &["gpu_index", "node"],
186        )
187        .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
188
189        // Register all metrics
190        registry.register(Box::new(services_total.clone())).ok();
191        registry.register(Box::new(service_replicas.clone())).ok();
192        registry.register(Box::new(service_health.clone())).ok();
193        registry.register(Box::new(scale_events_total.clone())).ok();
194        registry.register(Box::new(scale_up_total.clone())).ok();
195        registry.register(Box::new(scale_down_total.clone())).ok();
196        registry.register(Box::new(requests_total.clone())).ok();
197        registry
198            .register(Box::new(request_duration_seconds.clone()))
199            .ok();
200        registry.register(Box::new(raft_is_leader.clone())).ok();
201        registry.register(Box::new(raft_term.clone())).ok();
202        registry.register(Box::new(raft_commit_index.clone())).ok();
203        registry.register(Box::new(uptime_seconds.clone())).ok();
204        registry.register(Box::new(gpu_utilization.clone())).ok();
205        registry.register(Box::new(gpu_memory_used.clone())).ok();
206        registry.register(Box::new(gpu_memory_total.clone())).ok();
207        registry.register(Box::new(gpu_temperature.clone())).ok();
208        registry.register(Box::new(gpu_power.clone())).ok();
209
210        Ok(Self {
211            registry,
212            services_total,
213            service_replicas,
214            service_health,
215            scale_events_total,
216            scale_up_total,
217            scale_down_total,
218            requests_total,
219            request_duration_seconds,
220            raft_is_leader,
221            raft_term,
222            raft_commit_index,
223            uptime_seconds,
224            gpu_utilization,
225            gpu_memory_used,
226            gpu_memory_total,
227            gpu_temperature,
228            gpu_power,
229        })
230    }
231
232    /// Get the Prometheus registry
233    #[must_use]
234    pub fn registry(&self) -> &Registry {
235        &self.registry
236    }
237
238    /// Encode metrics in Prometheus text format
239    ///
240    /// # Errors
241    /// Returns an error if metric encoding fails.
242    pub fn encode(&self) -> Result<String> {
243        let encoder = TextEncoder::new();
244        let metric_families = self.registry.gather();
245        let mut buffer = Vec::new();
246        encoder
247            .encode(&metric_families, &mut buffer)
248            .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
249        String::from_utf8(buffer).map_err(|e| ObservabilityError::MetricsInit(e.to_string()))
250    }
251
252    /// Record a scale event
253    pub fn record_scale_event(&self, service: &str, direction: &str) {
254        self.scale_events_total
255            .with_label_values(&[service, direction])
256            .inc();
257
258        match direction {
259            "up" => self.scale_up_total.inc(),
260            "down" => self.scale_down_total.inc(),
261            _ => {}
262        }
263    }
264
265    /// Update service replica count
266    pub fn set_replicas(&self, service: &str, count: u32) {
267        self.service_replicas
268            .with_label_values(&[service])
269            .set(f64::from(count));
270    }
271
272    /// Update service health status
273    pub fn set_health(&self, service: &str, health: HealthStatus) {
274        self.service_health
275            .with_label_values(&[service])
276            .set(f64::from(health as i32));
277    }
278
279    /// Record an HTTP request
280    pub fn record_request(&self, method: &str, path: &str, status: u16, duration_secs: f64) {
281        let status_str = status.to_string();
282        self.requests_total
283            .with_label_values(&[method, path, &status_str])
284            .inc();
285        self.request_duration_seconds
286            .with_label_values(&[method, path])
287            .observe(duration_secs);
288    }
289
290    /// Update Raft leader status
291    pub fn set_raft_leader(&self, is_leader: bool) {
292        self.raft_is_leader.set(if is_leader { 1.0 } else { 0.0 });
293    }
294
295    /// Update Raft term
296    #[allow(clippy::cast_precision_loss)]
297    pub fn set_raft_term(&self, term: u64) {
298        self.raft_term.set(term as f64);
299    }
300
301    /// Update Raft commit index
302    #[allow(clippy::cast_precision_loss)]
303    pub fn set_raft_commit_index(&self, index: u64) {
304        self.raft_commit_index.set(index as f64);
305    }
306
307    /// Update uptime
308    pub fn set_uptime(&self, seconds: f64) {
309        self.uptime_seconds.set(seconds);
310    }
311
312    /// Update GPU utilization percentage
313    pub fn set_gpu_utilization(&self, gpu_index: u32, node: &str, percent: f64) {
314        self.gpu_utilization
315            .with_label_values(&[&gpu_index.to_string(), node])
316            .set(percent);
317    }
318
319    /// Update GPU memory used in bytes
320    #[allow(clippy::cast_precision_loss)]
321    pub fn set_gpu_memory_used(&self, gpu_index: u32, node: &str, bytes: u64) {
322        self.gpu_memory_used
323            .with_label_values(&[&gpu_index.to_string(), node])
324            .set(bytes as f64);
325    }
326
327    /// Update GPU total memory in bytes
328    #[allow(clippy::cast_precision_loss)]
329    pub fn set_gpu_memory_total(&self, gpu_index: u32, node: &str, bytes: u64) {
330        self.gpu_memory_total
331            .with_label_values(&[&gpu_index.to_string(), node])
332            .set(bytes as f64);
333    }
334
335    /// Update GPU temperature in celsius
336    pub fn set_gpu_temperature(&self, gpu_index: u32, node: &str, celsius: f64) {
337        self.gpu_temperature
338            .with_label_values(&[&gpu_index.to_string(), node])
339            .set(celsius);
340    }
341
342    /// Update GPU power draw in watts
343    pub fn set_gpu_power(&self, gpu_index: u32, node: &str, watts: f64) {
344        self.gpu_power
345            .with_label_values(&[&gpu_index.to_string(), node])
346            .set(watts);
347    }
348}
349
350impl Default for ZLayerMetrics {
351    fn default() -> Self {
352        Self::new().expect("Failed to create default metrics")
353    }
354}
355
/// Health status values for metrics
///
/// The explicit discriminants are load-bearing: `set_health` exports the
/// numeric value (`health as i32`) as the gauge value, so these numbers are
/// part of the exposed metric contract and must not be reordered.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(i32)]
pub enum HealthStatus {
    /// Unknown health status
    Unknown = 0,
    /// Service is healthy
    Healthy = 1,
    /// Service is degraded
    Degraded = 2,
    /// Service is unhealthy
    Unhealthy = 3,
}
369
/// Global metrics instance, initialized at most once by `init_metrics`
/// and read via `metrics()`. `OnceLock` makes first-write thread-safe.
static METRICS: std::sync::OnceLock<ZLayerMetrics> = std::sync::OnceLock::new();
372
373/// Initialize global metrics
374///
375/// # Errors
376/// This function currently never fails but returns `Result` for API consistency.
377///
378/// # Panics
379/// Panics if `ZLayerMetrics::new()` fails during first initialization.
380#[allow(clippy::unnecessary_wraps)]
381pub fn init_metrics(config: &MetricsConfig) -> Result<&'static ZLayerMetrics> {
382    if !config.enabled {
383        info!("Metrics disabled by configuration");
384    }
385
386    METRICS.get_or_init(|| ZLayerMetrics::new().expect("Failed to initialize metrics"));
387
388    Ok(METRICS.get().unwrap())
389}
390
/// Get the global metrics instance
///
/// Returns `None` until `init_metrics` has been called.
pub fn metrics() -> Option<&'static ZLayerMetrics> {
    METRICS.get()
}
395
/// Axum handler for Prometheus metrics endpoint
///
/// Responds 200 with the text exposition body, 500 if encoding fails, or
/// 503 if `init_metrics` has not run yet.
#[cfg(feature = "axum")]
#[allow(clippy::unused_async)]
pub async fn metrics_handler() -> impl axum::response::IntoResponse {
    use axum::http::StatusCode;

    // Guard clause: no global metrics means the endpoint is unavailable.
    let Some(collected) = metrics() else {
        return (
            StatusCode::SERVICE_UNAVAILABLE,
            "Metrics not initialized".to_string(),
        );
    };

    match collected.encode() {
        Ok(body) => (StatusCode::OK, body),
        Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("Error: {e}")),
    }
}
413
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test: metrics construct, accept writes, and round-trip through
    // the text encoder with their registered names present.
    #[test]
    fn test_metrics_creation() {
        let metrics = ZLayerMetrics::new().unwrap();

        // Test basic operations
        metrics.services_total.set(5.0);
        metrics.set_replicas("api", 3);
        metrics.record_scale_event("api", "up");

        // Should be able to encode
        let encoded = metrics.encode().unwrap();
        assert!(encoded.contains("zlayer_services_total"));
        assert!(encoded.contains("zlayer_service_replicas"));
        assert!(encoded.contains("zlayer_scale_events_total"));
    }

    // record_request should populate both the counter and the histogram.
    #[test]
    fn test_request_recording() {
        let metrics = ZLayerMetrics::new().unwrap();

        metrics.record_request("GET", "/api/health", 200, 0.015);
        metrics.record_request("POST", "/api/deploy", 201, 1.234);

        let encoded = metrics.encode().unwrap();
        assert!(encoded.contains("zlayer_requests_total"));
        assert!(encoded.contains("zlayer_request_duration_seconds"));
    }

    // Raft gauges expose the exact values written (integral, so no
    // float-formatting surprises in the text output).
    #[test]
    fn test_raft_metrics() {
        let metrics = ZLayerMetrics::new().unwrap();

        metrics.set_raft_leader(true);
        metrics.set_raft_term(42);
        metrics.set_raft_commit_index(100);

        let encoded = metrics.encode().unwrap();
        assert!(encoded.contains("zlayer_raft_is_leader 1"));
        assert!(encoded.contains("zlayer_raft_term 42"));
        assert!(encoded.contains("zlayer_raft_commit_index 100"));
    }

    // Health statuses for multiple services share one labeled gauge family.
    #[test]
    fn test_health_status() {
        let metrics = ZLayerMetrics::new().unwrap();

        metrics.set_health("api", HealthStatus::Healthy);
        metrics.set_health("db", HealthStatus::Degraded);

        let encoded = metrics.encode().unwrap();
        assert!(encoded.contains("zlayer_service_health"));
    }

    // "up"/"down" directions bump the matching aggregate counters.
    #[test]
    fn test_scale_events() {
        let metrics = ZLayerMetrics::new().unwrap();

        metrics.record_scale_event("api", "up");
        metrics.record_scale_event("api", "up");
        metrics.record_scale_event("api", "down");

        let encoded = metrics.encode().unwrap();
        assert!(encoded.contains("zlayer_scale_up_total 2"));
        assert!(encoded.contains("zlayer_scale_down_total 1"));
    }

    // GPU gauges accept multiple (gpu_index, node) label combinations and
    // all five families appear in the exposition output.
    #[test]
    fn test_gpu_metrics() {
        let metrics = ZLayerMetrics::new().unwrap();

        metrics.set_gpu_utilization(0, "node-1", 85.5);
        metrics.set_gpu_memory_used(0, "node-1", 8_589_934_592);
        metrics.set_gpu_memory_total(0, "node-1", 17_179_869_184);
        metrics.set_gpu_temperature(0, "node-1", 72.0);
        metrics.set_gpu_power(0, "node-1", 250.0);

        // Second GPU on same node
        metrics.set_gpu_utilization(1, "node-1", 42.0);
        metrics.set_gpu_temperature(1, "node-1", 65.5);

        let encoded = metrics.encode().unwrap();
        assert!(encoded.contains("zlayer_gpu_utilization_percent"));
        assert!(encoded.contains("zlayer_gpu_memory_used_bytes"));
        assert!(encoded.contains("zlayer_gpu_memory_total_bytes"));
        assert!(encoded.contains("zlayer_gpu_temperature_celsius"));
        assert!(encoded.contains("zlayer_gpu_power_watts"));
    }
}