use prometheus::{
Counter, CounterVec, Encoder, Gauge, GaugeVec, HistogramVec, Opts, Registry, TextEncoder,
};
use tracing::info;
use crate::config::MetricsConfig;
use crate::error::{ObservabilityError, Result};
/// Prometheus metrics exported by ZLayer, bundled with the registry that owns them.
///
/// Every public field is a handle to a metric that `ZLayerMetrics::new` creates and
/// registers in the private `registry`. The handles can be updated directly or through
/// the helper methods on this type.
pub struct ZLayerMetrics {
/// Registry that `new` registers every metric below into; exposed via `registry()`.
registry: Registry,
/// `zlayer_services_total`: total number of registered services.
pub services_total: Gauge,
/// `zlayer_service_replicas`: current replica count, labelled by `service`.
pub service_replicas: GaugeVec,
/// `zlayer_service_health`: health status code (0=unknown, 1=healthy, 2=degraded,
/// 3=unhealthy), labelled by `service`.
pub service_health: GaugeVec,
/// `zlayer_scale_events_total`: scaling events, labelled by `service` and `direction`.
pub scale_events_total: CounterVec,
/// `zlayer_scale_up_total`: unlabelled count of scale-up events.
pub scale_up_total: Counter,
/// `zlayer_scale_down_total`: unlabelled count of scale-down events.
pub scale_down_total: Counter,
/// `zlayer_requests_total`: HTTP requests, labelled by `method`, `path` and `status`.
pub requests_total: CounterVec,
/// `zlayer_request_duration_seconds`: HTTP request duration histogram in seconds,
/// labelled by `method` and `path`.
pub request_duration_seconds: HistogramVec,
/// `zlayer_raft_is_leader`: 1 when this node is the Raft leader, 0 otherwise.
pub raft_is_leader: Gauge,
/// `zlayer_raft_term`: current Raft term.
pub raft_term: Gauge,
/// `zlayer_raft_commit_index`: current Raft commit index.
pub raft_commit_index: Gauge,
/// `zlayer_uptime_seconds`: time since the service started, in seconds.
pub uptime_seconds: Gauge,
/// `zlayer_gpu_utilization_percent`: GPU utilization, labelled by `gpu_index` and `node`.
pub gpu_utilization: GaugeVec,
/// `zlayer_gpu_memory_used_bytes`: GPU memory in use, labelled by `gpu_index` and `node`.
pub gpu_memory_used: GaugeVec,
/// `zlayer_gpu_memory_total_bytes`: total GPU memory, labelled by `gpu_index` and `node`.
pub gpu_memory_total: GaugeVec,
/// `zlayer_gpu_temperature_celsius`: GPU temperature, labelled by `gpu_index` and `node`.
pub gpu_temperature: GaugeVec,
/// `zlayer_gpu_power_watts`: GPU power draw, labelled by `gpu_index` and `node`.
pub gpu_power: GaugeVec,
}
impl ZLayerMetrics {
#[allow(clippy::too_many_lines)]
pub fn new() -> Result<Self> {
let registry = Registry::new();
let services_total = Gauge::new(
"zlayer_services_total",
"Total number of registered services",
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let service_replicas = GaugeVec::new(
Opts::new(
"zlayer_service_replicas",
"Current replica count per service",
),
&["service"],
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let service_health = GaugeVec::new(
Opts::new(
"zlayer_service_health",
"Service health status (0=unknown, 1=healthy, 2=degraded, 3=unhealthy)",
),
&["service"],
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let scale_events_total = CounterVec::new(
Opts::new("zlayer_scale_events_total", "Total scaling events"),
&["service", "direction"],
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let scale_up_total = Counter::new("zlayer_scale_up_total", "Total scale up events")
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let scale_down_total = Counter::new("zlayer_scale_down_total", "Total scale down events")
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let requests_total = CounterVec::new(
Opts::new("zlayer_requests_total", "Total HTTP requests"),
&["method", "path", "status"],
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let request_duration_seconds = HistogramVec::new(
prometheus::HistogramOpts::new(
"zlayer_request_duration_seconds",
"HTTP request duration in seconds",
)
.buckets(vec![
0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
]),
&["method", "path"],
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let raft_is_leader = Gauge::new(
"zlayer_raft_is_leader",
"Whether this node is the Raft leader (1) or not (0)",
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let raft_term = Gauge::new("zlayer_raft_term", "Current Raft term")
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let raft_commit_index = Gauge::new("zlayer_raft_commit_index", "Current Raft commit index")
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let uptime_seconds = Gauge::new(
"zlayer_uptime_seconds",
"Time since the service started in seconds",
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let gpu_utilization = GaugeVec::new(
Opts::new(
"zlayer_gpu_utilization_percent",
"GPU utilization percentage",
),
&["gpu_index", "node"],
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let gpu_memory_used = GaugeVec::new(
Opts::new("zlayer_gpu_memory_used_bytes", "GPU memory used in bytes"),
&["gpu_index", "node"],
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let gpu_memory_total = GaugeVec::new(
Opts::new("zlayer_gpu_memory_total_bytes", "GPU total memory in bytes"),
&["gpu_index", "node"],
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let gpu_temperature = GaugeVec::new(
Opts::new(
"zlayer_gpu_temperature_celsius",
"GPU temperature in celsius",
),
&["gpu_index", "node"],
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
let gpu_power = GaugeVec::new(
Opts::new("zlayer_gpu_power_watts", "GPU power draw in watts"),
&["gpu_index", "node"],
)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
registry.register(Box::new(services_total.clone())).ok();
registry.register(Box::new(service_replicas.clone())).ok();
registry.register(Box::new(service_health.clone())).ok();
registry.register(Box::new(scale_events_total.clone())).ok();
registry.register(Box::new(scale_up_total.clone())).ok();
registry.register(Box::new(scale_down_total.clone())).ok();
registry.register(Box::new(requests_total.clone())).ok();
registry
.register(Box::new(request_duration_seconds.clone()))
.ok();
registry.register(Box::new(raft_is_leader.clone())).ok();
registry.register(Box::new(raft_term.clone())).ok();
registry.register(Box::new(raft_commit_index.clone())).ok();
registry.register(Box::new(uptime_seconds.clone())).ok();
registry.register(Box::new(gpu_utilization.clone())).ok();
registry.register(Box::new(gpu_memory_used.clone())).ok();
registry.register(Box::new(gpu_memory_total.clone())).ok();
registry.register(Box::new(gpu_temperature.clone())).ok();
registry.register(Box::new(gpu_power.clone())).ok();
Ok(Self {
registry,
services_total,
service_replicas,
service_health,
scale_events_total,
scale_up_total,
scale_down_total,
requests_total,
request_duration_seconds,
raft_is_leader,
raft_term,
raft_commit_index,
uptime_seconds,
gpu_utilization,
gpu_memory_used,
gpu_memory_total,
gpu_temperature,
gpu_power,
})
}
#[must_use]
pub fn registry(&self) -> &Registry {
&self.registry
}
pub fn encode(&self) -> Result<String> {
let encoder = TextEncoder::new();
let metric_families = self.registry.gather();
let mut buffer = Vec::new();
encoder
.encode(&metric_families, &mut buffer)
.map_err(|e| ObservabilityError::MetricsInit(e.to_string()))?;
String::from_utf8(buffer).map_err(|e| ObservabilityError::MetricsInit(e.to_string()))
}
pub fn record_scale_event(&self, service: &str, direction: &str) {
self.scale_events_total
.with_label_values(&[service, direction])
.inc();
match direction {
"up" => self.scale_up_total.inc(),
"down" => self.scale_down_total.inc(),
_ => {}
}
}
pub fn set_replicas(&self, service: &str, count: u32) {
self.service_replicas
.with_label_values(&[service])
.set(f64::from(count));
}
pub fn set_health(&self, service: &str, health: HealthStatus) {
self.service_health
.with_label_values(&[service])
.set(f64::from(health as i32));
}
pub fn record_request(&self, method: &str, path: &str, status: u16, duration_secs: f64) {
let status_str = status.to_string();
self.requests_total
.with_label_values(&[method, path, &status_str])
.inc();
self.request_duration_seconds
.with_label_values(&[method, path])
.observe(duration_secs);
}
pub fn set_raft_leader(&self, is_leader: bool) {
self.raft_is_leader.set(if is_leader { 1.0 } else { 0.0 });
}
#[allow(clippy::cast_precision_loss)]
pub fn set_raft_term(&self, term: u64) {
self.raft_term.set(term as f64);
}
#[allow(clippy::cast_precision_loss)]
pub fn set_raft_commit_index(&self, index: u64) {
self.raft_commit_index.set(index as f64);
}
pub fn set_uptime(&self, seconds: f64) {
self.uptime_seconds.set(seconds);
}
pub fn set_gpu_utilization(&self, gpu_index: u32, node: &str, percent: f64) {
self.gpu_utilization
.with_label_values(&[&gpu_index.to_string(), node])
.set(percent);
}
#[allow(clippy::cast_precision_loss)]
pub fn set_gpu_memory_used(&self, gpu_index: u32, node: &str, bytes: u64) {
self.gpu_memory_used
.with_label_values(&[&gpu_index.to_string(), node])
.set(bytes as f64);
}
#[allow(clippy::cast_precision_loss)]
pub fn set_gpu_memory_total(&self, gpu_index: u32, node: &str, bytes: u64) {
self.gpu_memory_total
.with_label_values(&[&gpu_index.to_string(), node])
.set(bytes as f64);
}
pub fn set_gpu_temperature(&self, gpu_index: u32, node: &str, celsius: f64) {
self.gpu_temperature
.with_label_values(&[&gpu_index.to_string(), node])
.set(celsius);
}
pub fn set_gpu_power(&self, gpu_index: u32, node: &str, watts: f64) {
self.gpu_power
.with_label_values(&[&gpu_index.to_string(), node])
.set(watts);
}
}
impl Default for ZLayerMetrics {
    /// Builds the metrics set via [`ZLayerMetrics::new`].
    ///
    /// # Panics
    ///
    /// Panics if `ZLayerMetrics::new` returns an error, because `Default` has no way
    /// to report failure.
    fn default() -> Self {
        let built = Self::new();
        built.expect("Failed to create default metrics")
    }
}
/// Health state of a service.
///
/// `ZLayerMetrics::set_health` casts the variant to its `i32` discriminant and stores
/// that number in the `zlayer_service_health` gauge, so the numeric values below are
/// what appears in the exported metric.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(i32)]
pub enum HealthStatus {
/// Exported as 0.
Unknown = 0,
/// Exported as 1.
Healthy = 1,
/// Exported as 2.
Degraded = 2,
/// Exported as 3.
Unhealthy = 3,
}
// Process-wide metrics instance: populated by `init_metrics`, read by `metrics`.
static METRICS: std::sync::OnceLock<ZLayerMetrics> = std::sync::OnceLock::new();
/// Initialises the process-wide [`ZLayerMetrics`] instance and returns a reference to it.
///
/// Calling this more than once is harmless: later calls return the instance that was
/// stored first.
///
/// When `config.enabled` is false this only logs a message; the metrics instance is
/// still created and returned.
///
/// # Errors
///
/// Returns the error from [`ZLayerMetrics::new`] if the metrics cannot be built.
pub fn init_metrics(config: &MetricsConfig) -> Result<&'static ZLayerMetrics> {
    if !config.enabled {
        info!("Metrics disabled by configuration");
    }

    if let Some(existing) = METRICS.get() {
        return Ok(existing);
    }

    // Build outside the `get_or_init` closure so a construction failure is returned
    // to the caller instead of panicking inside the initialiser.
    let fresh = ZLayerMetrics::new()?;

    // If another thread stored an instance in the meantime, `get_or_init` returns
    // that one and `fresh` is dropped.
    Ok(METRICS.get_or_init(|| fresh))
}
/// Returns the process-wide metrics instance, or `None` if `init_metrics` has not
/// stored one yet.
pub fn metrics() -> Option<&'static ZLayerMetrics> {
    let global = &METRICS;
    global.get()
}
/// Axum handler that serves the encoded metrics.
///
/// Responds with:
/// - `200 OK` and the encoded metrics text when encoding succeeds,
/// - `500 Internal Server Error` and `Error: <cause>` when encoding fails,
/// - `503 Service Unavailable` when the global metrics have not been initialised.
#[cfg(feature = "axum")]
// The body never awaits, but the function stays `async` to keep its handler signature.
#[allow(clippy::unused_async)]
pub async fn metrics_handler() -> impl axum::response::IntoResponse {
    use axum::http::StatusCode;

    let Some(global) = metrics() else {
        return (
            StatusCode::SERVICE_UNAVAILABLE,
            "Metrics not initialized".to_string(),
        );
    };

    match global.encode() {
        Ok(body) => (StatusCode::OK, body),
        Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("Error: {e}")),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an isolated metrics set so tests never share a registry.
    fn fresh() -> ZLayerMetrics {
        ZLayerMetrics::new().unwrap()
    }

    /// Encodes `m` and asserts that every fragment occurs in the rendered text.
    fn assert_exposes(m: &ZLayerMetrics, fragments: &[&str]) {
        let text = m.encode().unwrap();
        for &fragment in fragments {
            assert!(text.contains(fragment));
        }
    }

    #[test]
    fn test_metrics_creation() {
        let m = fresh();
        m.services_total.set(5.0);
        m.set_replicas("api", 3);
        m.record_scale_event("api", "up");

        assert_exposes(
            &m,
            &[
                "zlayer_services_total",
                "zlayer_service_replicas",
                "zlayer_scale_events_total",
            ],
        );
    }

    #[test]
    fn test_request_recording() {
        let m = fresh();
        m.record_request("GET", "/api/health", 200, 0.015);
        m.record_request("POST", "/api/deploy", 201, 1.234);

        assert_exposes(
            &m,
            &["zlayer_requests_total", "zlayer_request_duration_seconds"],
        );
    }

    #[test]
    fn test_raft_metrics() {
        let m = fresh();
        m.set_raft_leader(true);
        m.set_raft_term(42);
        m.set_raft_commit_index(100);

        // Unlabelled gauges render as `<name> <value>`, so the value is checked too.
        assert_exposes(
            &m,
            &[
                "zlayer_raft_is_leader 1",
                "zlayer_raft_term 42",
                "zlayer_raft_commit_index 100",
            ],
        );
    }

    #[test]
    fn test_health_status() {
        let m = fresh();
        m.set_health("api", HealthStatus::Healthy);
        m.set_health("db", HealthStatus::Degraded);

        assert_exposes(&m, &["zlayer_service_health"]);
    }

    #[test]
    fn test_scale_events() {
        let m = fresh();
        for direction in ["up", "up", "down"] {
            m.record_scale_event("api", direction);
        }

        assert_exposes(&m, &["zlayer_scale_up_total 2", "zlayer_scale_down_total 1"]);
    }

    #[test]
    fn test_gpu_metrics() {
        let m = fresh();

        // GPU 0: every family populated.
        m.set_gpu_utilization(0, "node-1", 85.5);
        m.set_gpu_memory_used(0, "node-1", 8_589_934_592);
        m.set_gpu_memory_total(0, "node-1", 17_179_869_184);
        m.set_gpu_temperature(0, "node-1", 72.0);
        m.set_gpu_power(0, "node-1", 250.0);

        // GPU 1: only a subset populated.
        m.set_gpu_utilization(1, "node-1", 42.0);
        m.set_gpu_temperature(1, "node-1", 65.5);

        assert_exposes(
            &m,
            &[
                "zlayer_gpu_utilization_percent",
                "zlayer_gpu_memory_used_bytes",
                "zlayer_gpu_memory_total_bytes",
                "zlayer_gpu_temperature_celsius",
                "zlayer_gpu_power_watts",
            ],
        );
    }
}