use std::sync::Arc;
use std::time::Duration;
use prometheus::{
Gauge, GaugeVec, Histogram, HistogramOpts, HistogramVec,
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, Opts, Registry,
TextEncoder, Encoder,
};
pub struct PrometheusMetrics {
registry: Registry,
pub connections_total: IntCounter,
pub connections_active: IntGauge,
pub connections_by_user: IntGaugeVec,
pub sessions_total: IntCounter,
pub sessions_active: IntGauge,
pub session_duration_seconds: Histogram,
pub uplinks_total: IntGauge,
pub uplinks_active: IntGauge,
pub uplink_state: IntGaugeVec,
pub uplink_health: GaugeVec,
pub bytes_sent_total: IntCounterVec,
pub bytes_received_total: IntCounterVec,
pub packets_sent_total: IntCounterVec,
pub packets_received_total: IntCounterVec,
pub packets_dropped_total: IntCounterVec,
pub packets_retransmitted_total: IntCounterVec,
pub rtt_seconds: GaugeVec,
pub rtt_histogram: HistogramVec,
pub jitter_seconds: GaugeVec,
pub packet_loss_ratio: GaugeVec,
pub quality_score: GaugeVec,
pub bandwidth_bytes_per_sec: GaugeVec,
pub nat_type: IntGaugeVec,
pub external_port: IntGaugeVec,
pub active_flows: IntGauge,
pub flow_bindings: IntGaugeVec,
pub scheduler_selections_total: IntCounterVec,
pub scheduler_latency_seconds: Histogram,
pub handshakes_total: IntCounter,
pub handshakes_failed: IntCounter,
pub rekeys_total: IntCounter,
pub encrypt_operations: IntCounter,
pub decrypt_operations: IntCounter,
pub errors_total: IntCounterVec,
pub server_uptime_seconds: Gauge,
pub users_total: IntGauge,
pub users_active: IntGauge,
}
impl PrometheusMetrics {
pub fn new() -> Result<Self, prometheus::Error> {
let registry = Registry::new();
let connections_total = IntCounter::new(
"triglav_connections_total",
"Total number of connections established"
)?;
let connections_active = IntGauge::new(
"triglav_connections_active",
"Number of currently active connections"
)?;
let connections_by_user = IntGaugeVec::new(
Opts::new("triglav_connections_by_user", "Active connections per user"),
&["user_id"]
)?;
let sessions_total = IntCounter::new(
"triglav_sessions_total",
"Total number of sessions created"
)?;
let sessions_active = IntGauge::new(
"triglav_sessions_active",
"Number of currently active sessions"
)?;
let session_duration_seconds = Histogram::with_opts(
HistogramOpts::new(
"triglav_session_duration_seconds",
"Session duration in seconds"
).buckets(vec![1.0, 5.0, 10.0, 30.0, 60.0, 300.0, 600.0, 1800.0, 3600.0])
)?;
let uplinks_total = IntGauge::new(
"triglav_uplinks_total",
"Total number of configured uplinks"
)?;
let uplinks_active = IntGauge::new(
"triglav_uplinks_active",
"Number of currently active/usable uplinks"
)?;
let uplink_state = IntGaugeVec::new(
Opts::new("triglav_uplink_state", "Uplink connection state (0=disconnected, 1=connecting, 2=connected, 3=failed)"),
&["uplink_id", "interface"]
)?;
let uplink_health = GaugeVec::new(
Opts::new("triglav_uplink_health", "Uplink health score (0-1)"),
&["uplink_id", "interface"]
)?;
let bytes_sent_total = IntCounterVec::new(
Opts::new("triglav_bytes_sent_total", "Total bytes sent"),
&["uplink_id"]
)?;
let bytes_received_total = IntCounterVec::new(
Opts::new("triglav_bytes_received_total", "Total bytes received"),
&["uplink_id"]
)?;
let packets_sent_total = IntCounterVec::new(
Opts::new("triglav_packets_sent_total", "Total packets sent"),
&["uplink_id"]
)?;
let packets_received_total = IntCounterVec::new(
Opts::new("triglav_packets_received_total", "Total packets received"),
&["uplink_id"]
)?;
let packets_dropped_total = IntCounterVec::new(
Opts::new("triglav_packets_dropped_total", "Total packets dropped"),
&["uplink_id", "reason"]
)?;
let packets_retransmitted_total = IntCounterVec::new(
Opts::new("triglav_packets_retransmitted_total", "Total packets retransmitted"),
&["uplink_id"]
)?;
let rtt_seconds = GaugeVec::new(
Opts::new("triglav_rtt_seconds", "Current smoothed RTT in seconds"),
&["uplink_id"]
)?;
let rtt_histogram = HistogramVec::new(
HistogramOpts::new(
"triglav_rtt_histogram_seconds",
"RTT distribution in seconds"
).buckets(vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5]),
&["uplink_id"]
)?;
let jitter_seconds = GaugeVec::new(
Opts::new("triglav_jitter_seconds", "Current jitter (RTT variance) in seconds"),
&["uplink_id"]
)?;
let packet_loss_ratio = GaugeVec::new(
Opts::new("triglav_packet_loss_ratio", "Packet loss ratio (0-1)"),
&["uplink_id"]
)?;
let quality_score = GaugeVec::new(
Opts::new("triglav_quality_score", "Overall quality score (0-100)"),
&["uplink_id"]
)?;
let bandwidth_bytes_per_sec = GaugeVec::new(
Opts::new("triglav_bandwidth_bytes_per_sec", "Estimated bandwidth in bytes/sec"),
&["uplink_id", "direction"]
)?;
let nat_type = IntGaugeVec::new(
Opts::new("triglav_nat_type", "NAT type (0=none, 1=full_cone, 2=restricted, 3=port_restricted, 4=symmetric, 5=unknown)"),
&["uplink_id"]
)?;
let external_port = IntGaugeVec::new(
Opts::new("triglav_external_port", "External port as seen by server"),
&["uplink_id"]
)?;
let active_flows = IntGauge::new(
"triglav_active_flows",
"Number of active flows"
)?;
let flow_bindings = IntGaugeVec::new(
Opts::new("triglav_flow_bindings", "Number of flows bound to each uplink"),
&["uplink_id"]
)?;
let scheduler_selections_total = IntCounterVec::new(
Opts::new("triglav_scheduler_selections_total", "Total scheduler selections per uplink"),
&["uplink_id", "strategy"]
)?;
let scheduler_latency_seconds = Histogram::with_opts(
HistogramOpts::new(
"triglav_scheduler_latency_seconds",
"Time spent in scheduler selection"
).buckets(vec![0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01])
)?;
let handshakes_total = IntCounter::new(
"triglav_handshakes_total",
"Total number of Noise handshakes completed"
)?;
let handshakes_failed = IntCounter::new(
"triglav_handshakes_failed",
"Total number of failed Noise handshakes"
)?;
let rekeys_total = IntCounter::new(
"triglav_rekeys_total",
"Total number of rekey operations"
)?;
let encrypt_operations = IntCounter::new(
"triglav_encrypt_operations_total",
"Total encryption operations"
)?;
let decrypt_operations = IntCounter::new(
"triglav_decrypt_operations_total",
"Total decryption operations"
)?;
let errors_total = IntCounterVec::new(
Opts::new("triglav_errors_total", "Total errors by type"),
&["error_type"]
)?;
let server_uptime_seconds = Gauge::new(
"triglav_server_uptime_seconds",
"Server uptime in seconds"
)?;
let users_total = IntGauge::new(
"triglav_users_total",
"Total registered users"
)?;
let users_active = IntGauge::new(
"triglav_users_active",
"Currently active users"
)?;
registry.register(Box::new(connections_total.clone()))?;
registry.register(Box::new(connections_active.clone()))?;
registry.register(Box::new(connections_by_user.clone()))?;
registry.register(Box::new(sessions_total.clone()))?;
registry.register(Box::new(sessions_active.clone()))?;
registry.register(Box::new(session_duration_seconds.clone()))?;
registry.register(Box::new(uplinks_total.clone()))?;
registry.register(Box::new(uplinks_active.clone()))?;
registry.register(Box::new(uplink_state.clone()))?;
registry.register(Box::new(uplink_health.clone()))?;
registry.register(Box::new(bytes_sent_total.clone()))?;
registry.register(Box::new(bytes_received_total.clone()))?;
registry.register(Box::new(packets_sent_total.clone()))?;
registry.register(Box::new(packets_received_total.clone()))?;
registry.register(Box::new(packets_dropped_total.clone()))?;
registry.register(Box::new(packets_retransmitted_total.clone()))?;
registry.register(Box::new(rtt_seconds.clone()))?;
registry.register(Box::new(rtt_histogram.clone()))?;
registry.register(Box::new(jitter_seconds.clone()))?;
registry.register(Box::new(packet_loss_ratio.clone()))?;
registry.register(Box::new(quality_score.clone()))?;
registry.register(Box::new(bandwidth_bytes_per_sec.clone()))?;
registry.register(Box::new(nat_type.clone()))?;
registry.register(Box::new(external_port.clone()))?;
registry.register(Box::new(active_flows.clone()))?;
registry.register(Box::new(flow_bindings.clone()))?;
registry.register(Box::new(scheduler_selections_total.clone()))?;
registry.register(Box::new(scheduler_latency_seconds.clone()))?;
registry.register(Box::new(handshakes_total.clone()))?;
registry.register(Box::new(handshakes_failed.clone()))?;
registry.register(Box::new(rekeys_total.clone()))?;
registry.register(Box::new(encrypt_operations.clone()))?;
registry.register(Box::new(decrypt_operations.clone()))?;
registry.register(Box::new(errors_total.clone()))?;
registry.register(Box::new(server_uptime_seconds.clone()))?;
registry.register(Box::new(users_total.clone()))?;
registry.register(Box::new(users_active.clone()))?;
Ok(Self {
registry,
connections_total,
connections_active,
connections_by_user,
sessions_total,
sessions_active,
session_duration_seconds,
uplinks_total,
uplinks_active,
uplink_state,
uplink_health,
bytes_sent_total,
bytes_received_total,
packets_sent_total,
packets_received_total,
packets_dropped_total,
packets_retransmitted_total,
rtt_seconds,
rtt_histogram,
jitter_seconds,
packet_loss_ratio,
quality_score,
bandwidth_bytes_per_sec,
nat_type,
external_port,
active_flows,
flow_bindings,
scheduler_selections_total,
scheduler_latency_seconds,
handshakes_total,
handshakes_failed,
rekeys_total,
encrypt_operations,
decrypt_operations,
errors_total,
server_uptime_seconds,
users_total,
users_active,
})
}
pub fn encode(&self) -> Result<String, prometheus::Error> {
let encoder = TextEncoder::new();
let metric_families = self.registry.gather();
let mut buffer = Vec::new();
encoder.encode(&metric_families, &mut buffer)?;
Ok(String::from_utf8(buffer).unwrap_or_default())
}
pub fn record_rtt(&self, uplink_id: &str, rtt: Duration) {
let seconds = rtt.as_secs_f64();
self.rtt_seconds.with_label_values(&[uplink_id]).set(seconds);
self.rtt_histogram.with_label_values(&[uplink_id]).observe(seconds);
}
pub fn record_packet_sent(&self, uplink_id: &str, bytes: u64) {
self.packets_sent_total.with_label_values(&[uplink_id]).inc();
self.bytes_sent_total.with_label_values(&[uplink_id]).inc_by(bytes);
}
pub fn record_packet_received(&self, uplink_id: &str, bytes: u64) {
self.packets_received_total.with_label_values(&[uplink_id]).inc();
self.bytes_received_total.with_label_values(&[uplink_id]).inc_by(bytes);
}
pub fn record_packet_dropped(&self, uplink_id: &str, reason: &str) {
self.packets_dropped_total.with_label_values(&[uplink_id, reason]).inc();
}
pub fn set_uplink_state(&self, uplink_id: &str, interface: &str, state: i64) {
self.uplink_state.with_label_values(&[uplink_id, interface]).set(state);
}
pub fn set_uplink_health(&self, uplink_id: &str, interface: &str, health: f64) {
self.uplink_health.with_label_values(&[uplink_id, interface]).set(health);
}
pub fn set_uplink_quality(&self, uplink_id: &str, loss: f64, score: f64, jitter_secs: f64) {
self.packet_loss_ratio.with_label_values(&[uplink_id]).set(loss);
self.quality_score.with_label_values(&[uplink_id]).set(score);
self.jitter_seconds.with_label_values(&[uplink_id]).set(jitter_secs);
}
pub fn set_bandwidth(&self, uplink_id: &str, send_bps: f64, recv_bps: f64) {
self.bandwidth_bytes_per_sec.with_label_values(&[uplink_id, "send"]).set(send_bps);
self.bandwidth_bytes_per_sec.with_label_values(&[uplink_id, "recv"]).set(recv_bps);
}
pub fn record_error(&self, error_type: &str) {
self.errors_total.with_label_values(&[error_type]).inc();
}
pub fn record_scheduler_selection(&self, uplink_id: &str, strategy: &str, latency: Duration) {
self.scheduler_selections_total.with_label_values(&[uplink_id, strategy]).inc();
self.scheduler_latency_seconds.observe(latency.as_secs_f64());
}
}
impl Default for PrometheusMetrics {
fn default() -> Self {
Self::new().expect("Failed to create Prometheus metrics")
}
}
static METRICS: std::sync::OnceLock<Arc<PrometheusMetrics>> = std::sync::OnceLock::new();
pub fn init_metrics() -> Arc<PrometheusMetrics> {
METRICS.get_or_init(|| {
Arc::new(PrometheusMetrics::new().expect("Failed to initialize metrics"))
}).clone()
}
pub fn get_metrics() -> Option<Arc<PrometheusMetrics>> {
METRICS.get().cloned()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_metrics_creation() {
let metrics = PrometheusMetrics::new().unwrap();
metrics.connections_total.inc();
metrics.record_rtt("uplink-1", Duration::from_millis(50));
metrics.record_packet_sent("uplink-1", 1000);
let output = metrics.encode().unwrap();
assert!(output.contains("triglav_connections_total"));
assert!(output.contains("triglav_rtt_seconds"));
}
#[test]
fn test_global_metrics() {
let m1 = init_metrics();
let m2 = init_metrics();
assert!(Arc::ptr_eq(&m1, &m2));
}
}