1use prometheus::{
6 Counter, CounterVec, Encoder, Gauge, GaugeVec, HistogramVec, Opts, Registry, TextEncoder,
7};
8use tracing::info;
9
10use crate::config::MetricsConfig;
11use crate::error::{ObservabilityError, Result};
12
/// Prometheus metrics for ZLayer, bundled with the registry that owns them.
///
/// The public fields are metric handles created and registered by
/// [`ZLayerMetrics::new`]; the registry itself is private and reachable
/// through [`ZLayerMetrics::registry`].
pub struct ZLayerMetrics {
    /// Registry the metrics below are registered with.
    registry: Registry,

    /// `zlayer_services_total`: total number of registered services.
    pub services_total: Gauge,
    /// `zlayer_service_replicas`: current replica count, labeled by `service`.
    pub service_replicas: GaugeVec,
    /// `zlayer_service_health`: health status code, labeled by `service`
    /// (0=unknown, 1=healthy, 2=degraded, 3=unhealthy).
    pub service_health: GaugeVec,

    /// `zlayer_scale_events_total`: scaling events, labeled by `service` and `direction`.
    pub scale_events_total: CounterVec,
    /// `zlayer_scale_up_total`: total scale-up events.
    pub scale_up_total: Counter,
    /// `zlayer_scale_down_total`: total scale-down events.
    pub scale_down_total: Counter,

    /// `zlayer_requests_total`: HTTP requests, labeled by `method`, `path` and `status`.
    pub requests_total: CounterVec,
    /// `zlayer_request_duration_seconds`: HTTP request duration histogram,
    /// labeled by `method` and `path`.
    pub request_duration_seconds: HistogramVec,

    /// `zlayer_raft_is_leader`: 1 when this node is the Raft leader, 0 otherwise.
    pub raft_is_leader: Gauge,
    /// `zlayer_raft_term`: current Raft term.
    pub raft_term: Gauge,
    /// `zlayer_raft_commit_index`: current Raft commit index.
    pub raft_commit_index: Gauge,

    /// `zlayer_uptime_seconds`: time since the service started, in seconds.
    pub uptime_seconds: Gauge,

    /// `zlayer_gpu_utilization_percent`: GPU utilization, labeled by `gpu_index` and `node`.
    pub gpu_utilization: GaugeVec,
    /// `zlayer_gpu_memory_used_bytes`: GPU memory in use, labeled by `gpu_index` and `node`.
    pub gpu_memory_used: GaugeVec,
    /// `zlayer_gpu_memory_total_bytes`: total GPU memory, labeled by `gpu_index` and `node`.
    pub gpu_memory_total: GaugeVec,
    /// `zlayer_gpu_temperature_celsius`: GPU temperature, labeled by `gpu_index` and `node`.
    pub gpu_temperature: GaugeVec,
    /// `zlayer_gpu_power_watts`: GPU power draw, labeled by `gpu_index` and `node`.
    pub gpu_power: GaugeVec,
}
65
66impl ZLayerMetrics {
67 #[allow(clippy::too_many_lines)]
72 pub fn new() -> Result<Self> {
73 let registry = Registry::new();
74
75 let services_total = Gauge::new(
77 "zlayer_services_total",
78 "Total number of registered services",
79 )
80 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
81
82 let service_replicas = GaugeVec::new(
83 Opts::new(
84 "zlayer_service_replicas",
85 "Current replica count per service",
86 ),
87 &["service"],
88 )
89 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
90
91 let service_health = GaugeVec::new(
92 Opts::new(
93 "zlayer_service_health",
94 "Service health status (0=unknown, 1=healthy, 2=degraded, 3=unhealthy)",
95 ),
96 &["service"],
97 )
98 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
99
100 let scale_events_total = CounterVec::new(
102 Opts::new("zlayer_scale_events_total", "Total scaling events"),
103 &["service", "direction"],
104 )
105 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
106
107 let scale_up_total = Counter::new("zlayer_scale_up_total", "Total scale up events")
108 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
109
110 let scale_down_total = Counter::new("zlayer_scale_down_total", "Total scale down events")
111 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
112
113 let requests_total = CounterVec::new(
115 Opts::new("zlayer_requests_total", "Total HTTP requests"),
116 &["method", "path", "status"],
117 )
118 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
119
120 let request_duration_seconds = HistogramVec::new(
121 prometheus::HistogramOpts::new(
122 "zlayer_request_duration_seconds",
123 "HTTP request duration in seconds",
124 )
125 .buckets(vec![
126 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
127 ]),
128 &["method", "path"],
129 )
130 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
131
132 let raft_is_leader = Gauge::new(
134 "zlayer_raft_is_leader",
135 "Whether this node is the Raft leader (1) or not (0)",
136 )
137 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
138
139 let raft_term = Gauge::new("zlayer_raft_term", "Current Raft term")
140 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
141
142 let raft_commit_index = Gauge::new("zlayer_raft_commit_index", "Current Raft commit index")
143 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
144
145 let uptime_seconds = Gauge::new(
147 "zlayer_uptime_seconds",
148 "Time since the service started in seconds",
149 )
150 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
151
152 let gpu_utilization = GaugeVec::new(
154 Opts::new(
155 "zlayer_gpu_utilization_percent",
156 "GPU utilization percentage",
157 ),
158 &["gpu_index", "node"],
159 )
160 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
161
162 let gpu_memory_used = GaugeVec::new(
163 Opts::new("zlayer_gpu_memory_used_bytes", "GPU memory used in bytes"),
164 &["gpu_index", "node"],
165 )
166 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
167
168 let gpu_memory_total = GaugeVec::new(
169 Opts::new("zlayer_gpu_memory_total_bytes", "GPU total memory in bytes"),
170 &["gpu_index", "node"],
171 )
172 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
173
174 let gpu_temperature = GaugeVec::new(
175 Opts::new(
176 "zlayer_gpu_temperature_celsius",
177 "GPU temperature in celsius",
178 ),
179 &["gpu_index", "node"],
180 )
181 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
182
183 let gpu_power = GaugeVec::new(
184 Opts::new("zlayer_gpu_power_watts", "GPU power draw in watts"),
185 &["gpu_index", "node"],
186 )
187 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
188
189 registry.register(Box::new(services_total.clone())).ok();
191 registry.register(Box::new(service_replicas.clone())).ok();
192 registry.register(Box::new(service_health.clone())).ok();
193 registry.register(Box::new(scale_events_total.clone())).ok();
194 registry.register(Box::new(scale_up_total.clone())).ok();
195 registry.register(Box::new(scale_down_total.clone())).ok();
196 registry.register(Box::new(requests_total.clone())).ok();
197 registry
198 .register(Box::new(request_duration_seconds.clone()))
199 .ok();
200 registry.register(Box::new(raft_is_leader.clone())).ok();
201 registry.register(Box::new(raft_term.clone())).ok();
202 registry.register(Box::new(raft_commit_index.clone())).ok();
203 registry.register(Box::new(uptime_seconds.clone())).ok();
204 registry.register(Box::new(gpu_utilization.clone())).ok();
205 registry.register(Box::new(gpu_memory_used.clone())).ok();
206 registry.register(Box::new(gpu_memory_total.clone())).ok();
207 registry.register(Box::new(gpu_temperature.clone())).ok();
208 registry.register(Box::new(gpu_power.clone())).ok();
209
210 Ok(Self {
211 registry,
212 services_total,
213 service_replicas,
214 service_health,
215 scale_events_total,
216 scale_up_total,
217 scale_down_total,
218 requests_total,
219 request_duration_seconds,
220 raft_is_leader,
221 raft_term,
222 raft_commit_index,
223 uptime_seconds,
224 gpu_utilization,
225 gpu_memory_used,
226 gpu_memory_total,
227 gpu_temperature,
228 gpu_power,
229 })
230 }
231
232 #[must_use]
234 pub fn registry(&self) -> &Registry {
235 &self.registry
236 }
237
238 pub fn encode(&self) -> Result<String> {
243 let encoder = TextEncoder::new();
244 let metric_families = self.registry.gather();
245 let mut buffer = Vec::new();
246 encoder
247 .encode(&metric_families, &mut buffer)
248 .map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
249 String::from_utf8(buffer).map_err(|e| ObservabilityError::MetricsInit(e.to_string()))
250 }
251
252 pub fn record_scale_event(&self, service: &str, direction: &str) {
254 self.scale_events_total
255 .with_label_values(&[service, direction])
256 .inc();
257
258 match direction {
259 "up" => self.scale_up_total.inc(),
260 "down" => self.scale_down_total.inc(),
261 _ => {}
262 }
263 }
264
265 pub fn set_replicas(&self, service: &str, count: u32) {
267 self.service_replicas
268 .with_label_values(&[service])
269 .set(f64::from(count));
270 }
271
272 pub fn set_health(&self, service: &str, health: HealthStatus) {
274 self.service_health
275 .with_label_values(&[service])
276 .set(f64::from(health as i32));
277 }
278
279 pub fn record_request(&self, method: &str, path: &str, status: u16, duration_secs: f64) {
281 let status_str = status.to_string();
282 self.requests_total
283 .with_label_values(&[method, path, &status_str])
284 .inc();
285 self.request_duration_seconds
286 .with_label_values(&[method, path])
287 .observe(duration_secs);
288 }
289
290 pub fn set_raft_leader(&self, is_leader: bool) {
292 self.raft_is_leader.set(if is_leader { 1.0 } else { 0.0 });
293 }
294
295 #[allow(clippy::cast_precision_loss)]
297 pub fn set_raft_term(&self, term: u64) {
298 self.raft_term.set(term as f64);
299 }
300
301 #[allow(clippy::cast_precision_loss)]
303 pub fn set_raft_commit_index(&self, index: u64) {
304 self.raft_commit_index.set(index as f64);
305 }
306
307 pub fn set_uptime(&self, seconds: f64) {
309 self.uptime_seconds.set(seconds);
310 }
311
312 pub fn set_gpu_utilization(&self, gpu_index: u32, node: &str, percent: f64) {
314 self.gpu_utilization
315 .with_label_values(&[&gpu_index.to_string(), node])
316 .set(percent);
317 }
318
319 #[allow(clippy::cast_precision_loss)]
321 pub fn set_gpu_memory_used(&self, gpu_index: u32, node: &str, bytes: u64) {
322 self.gpu_memory_used
323 .with_label_values(&[&gpu_index.to_string(), node])
324 .set(bytes as f64);
325 }
326
327 #[allow(clippy::cast_precision_loss)]
329 pub fn set_gpu_memory_total(&self, gpu_index: u32, node: &str, bytes: u64) {
330 self.gpu_memory_total
331 .with_label_values(&[&gpu_index.to_string(), node])
332 .set(bytes as f64);
333 }
334
335 pub fn set_gpu_temperature(&self, gpu_index: u32, node: &str, celsius: f64) {
337 self.gpu_temperature
338 .with_label_values(&[&gpu_index.to_string(), node])
339 .set(celsius);
340 }
341
342 pub fn set_gpu_power(&self, gpu_index: u32, node: &str, watts: f64) {
344 self.gpu_power
345 .with_label_values(&[&gpu_index.to_string(), node])
346 .set(watts);
347 }
348}
349
350impl Default for ZLayerMetrics {
351 fn default() -> Self {
352 Self::new().expect("Failed to create default metrics")
353 }
354}
355
/// Health state of a service, stored in the `zlayer_service_health` gauge
/// as its numeric discriminant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(i32)]
pub enum HealthStatus {
    /// Health is not known (exported as 0).
    Unknown = 0,
    /// Service is healthy (exported as 1).
    Healthy = 1,
    /// Service is degraded (exported as 2).
    Degraded = 2,
    /// Service is unhealthy (exported as 3).
    Unhealthy = 3,
}
369
/// Process-wide metrics instance; set at most once by `init_metrics` and read by `metrics`.
static METRICS: std::sync::OnceLock<ZLayerMetrics> = std::sync::OnceLock::new();
372
373#[allow(clippy::unnecessary_wraps)]
381pub fn init_metrics(config: &MetricsConfig) -> Result<&'static ZLayerMetrics> {
382 if !config.enabled {
383 info!("Metrics disabled by configuration");
384 }
385
386 METRICS.get_or_init(|| ZLayerMetrics::new().expect("Failed to initialize metrics"));
387
388 Ok(METRICS.get().unwrap())
389}
390
391pub fn metrics() -> Option<&'static ZLayerMetrics> {
393 METRICS.get()
394}
395
/// Axum handler that serves the encoded metrics.
///
/// Responds with 200 and the encoded body on success, 500 with the error text
/// when encoding fails, and 503 when no metrics instance has been initialized.
#[cfg(feature = "axum")]
#[allow(clippy::unused_async)]
pub async fn metrics_handler() -> impl axum::response::IntoResponse {
    use axum::http::StatusCode;

    let Some(current) = metrics() else {
        return (
            StatusCode::SERVICE_UNAVAILABLE,
            "Metrics not initialized".to_string(),
        );
    };

    match current.encode() {
        Ok(body) => (StatusCode::OK, body),
        Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("Error: {e}")),
    }
}
413
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every fragment occurs somewhere in the encoded output.
    fn assert_contains_all(encoded: &str, fragments: &[&str]) {
        for &fragment in fragments {
            assert!(encoded.contains(fragment));
        }
    }

    /// Metrics that were touched show up in the encoded output.
    #[test]
    fn test_metrics_creation() {
        let m = ZLayerMetrics::new().unwrap();

        m.services_total.set(5.0);
        m.set_replicas("api", 3);
        m.record_scale_event("api", "up");

        let output = m.encode().unwrap();
        assert_contains_all(
            &output,
            &[
                "zlayer_services_total",
                "zlayer_service_replicas",
                "zlayer_scale_events_total",
            ],
        );
    }

    /// Recording requests exports both the counter and the histogram.
    #[test]
    fn test_request_recording() {
        let m = ZLayerMetrics::new().unwrap();

        m.record_request("GET", "/api/health", 200, 0.015);
        m.record_request("POST", "/api/deploy", 201, 1.234);

        let output = m.encode().unwrap();
        assert_contains_all(
            &output,
            &["zlayer_requests_total", "zlayer_request_duration_seconds"],
        );
    }

    /// Raft gauges export the exact values that were set.
    #[test]
    fn test_raft_metrics() {
        let m = ZLayerMetrics::new().unwrap();

        m.set_raft_leader(true);
        m.set_raft_term(42);
        m.set_raft_commit_index(100);

        let output = m.encode().unwrap();
        assert_contains_all(
            &output,
            &[
                "zlayer_raft_is_leader 1",
                "zlayer_raft_term 42",
                "zlayer_raft_commit_index 100",
            ],
        );
    }

    /// Setting a health status exports the health gauge.
    #[test]
    fn test_health_status() {
        let m = ZLayerMetrics::new().unwrap();

        m.set_health("api", HealthStatus::Healthy);
        m.set_health("db", HealthStatus::Degraded);

        let output = m.encode().unwrap();
        assert_contains_all(&output, &["zlayer_service_health"]);
    }

    /// The aggregate up/down counters follow the recorded directions.
    #[test]
    fn test_scale_events() {
        let m = ZLayerMetrics::new().unwrap();

        for direction in ["up", "up", "down"] {
            m.record_scale_event("api", direction);
        }

        let output = m.encode().unwrap();
        assert_contains_all(
            &output,
            &["zlayer_scale_up_total 2", "zlayer_scale_down_total 1"],
        );
    }

    /// Every GPU gauge that was set is exported.
    #[test]
    fn test_gpu_metrics() {
        let m = ZLayerMetrics::new().unwrap();

        // First GPU: all five gauges.
        m.set_gpu_utilization(0, "node-1", 85.5);
        m.set_gpu_memory_used(0, "node-1", 8_589_934_592);
        m.set_gpu_memory_total(0, "node-1", 17_179_869_184);
        m.set_gpu_temperature(0, "node-1", 72.0);
        m.set_gpu_power(0, "node-1", 250.0);

        // Second GPU on the same node: only a subset.
        m.set_gpu_utilization(1, "node-1", 42.0);
        m.set_gpu_temperature(1, "node-1", 65.5);

        let output = m.encode().unwrap();
        assert_contains_all(
            &output,
            &[
                "zlayer_gpu_utilization_percent",
                "zlayer_gpu_memory_used_bytes",
                "zlayer_gpu_memory_total_bytes",
                "zlayer_gpu_temperature_celsius",
                "zlayer_gpu_power_watts",
            ],
        );
    }
}