guts_node/observability/
metrics.rs

1//! Prometheus metrics collection.
2//!
3//! Provides comprehensive metrics for:
4//! - HTTP request latency and counts
5//! - P2P message statistics
6//! - Storage operations
7//! - Business metrics (repos, PRs, issues)
8
9use once_cell::sync::Lazy;
10use parking_lot::RwLock;
11use prometheus_client::encoding::EncodeLabelSet;
12use prometheus_client::metrics::counter::Counter;
13use prometheus_client::metrics::family::Family;
14use prometheus_client::metrics::gauge::Gauge;
15use prometheus_client::metrics::histogram::{exponential_buckets, Histogram};
16use prometheus_client::registry::Registry;
17use std::sync::Arc;
18
19/// HTTP request labels.
20#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
21pub struct HttpLabels {
22    /// HTTP method (GET, POST, etc.)
23    pub method: String,
24    /// Request path pattern
25    pub path: String,
26    /// Response status code
27    pub status: u16,
28}
29
30/// P2P message labels.
31#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
32pub struct P2pLabels {
33    /// Message type
34    pub message_type: String,
35    /// Direction (sent/received)
36    pub direction: String,
37}
38
39/// Storage operation labels.
40#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
41pub struct StorageLabels {
42    /// Operation type (read, write, delete)
43    pub operation: String,
44    /// Object type (blob, tree, commit)
45    pub object_type: String,
46}
47
48/// Global metrics state.
49pub static METRICS: Lazy<MetricsState> = Lazy::new(MetricsState::new);
50
51/// Metrics state container.
52#[derive(Clone)]
53pub struct MetricsState {
54    /// Prometheus registry.
55    pub registry: Arc<RwLock<Registry>>,
56    /// HTTP request counter.
57    pub http_requests_total: Family<HttpLabels, Counter>,
58    /// HTTP request duration histogram (seconds).
59    pub http_request_duration_seconds: Family<HttpLabels, Histogram>,
60    /// HTTP active connections gauge.
61    pub http_active_connections: Gauge,
62    /// P2P connected peers gauge.
63    pub p2p_peers_connected: Gauge,
64    /// P2P messages counter.
65    pub p2p_messages_total: Family<P2pLabels, Counter>,
66    /// P2P message latency histogram.
67    pub p2p_message_latency_seconds: Family<P2pLabels, Histogram>,
68    /// Storage objects gauge by type.
69    pub storage_objects_total: Family<StorageLabels, Gauge>,
70    /// Storage operation duration histogram.
71    pub storage_operation_duration_seconds: Family<StorageLabels, Histogram>,
72    /// Total repositories gauge.
73    pub repositories_total: Gauge,
74    /// Pull requests by state.
75    pub pull_requests_total: Gauge,
76    /// Issues by state.
77    pub issues_total: Gauge,
78    /// Total users.
79    pub users_total: Gauge,
80    /// Total organizations.
81    pub organizations_total: Gauge,
82    /// WebSocket active connections.
83    pub websocket_connections: Gauge,
84}
85
86impl Default for MetricsState {
87    fn default() -> Self {
88        Self::new()
89    }
90}
91
92impl MetricsState {
93    /// Create a new metrics state with all metrics registered.
94    pub fn new() -> Self {
95        let mut registry = Registry::default();
96
97        // HTTP metrics
98        let http_requests_total = Family::<HttpLabels, Counter>::default();
99        registry.register(
100            "guts_http_requests",
101            "Total HTTP requests",
102            http_requests_total.clone(),
103        );
104
105        let http_request_duration_seconds =
106            Family::<HttpLabels, Histogram>::new_with_constructor(|| {
107                Histogram::new(exponential_buckets(0.001, 2.0, 16))
108            });
109        registry.register(
110            "guts_http_request_duration_seconds",
111            "HTTP request duration in seconds",
112            http_request_duration_seconds.clone(),
113        );
114
115        let http_active_connections = Gauge::default();
116        registry.register(
117            "guts_http_active_connections",
118            "Number of active HTTP connections",
119            http_active_connections.clone(),
120        );
121
122        // P2P metrics
123        let p2p_peers_connected = Gauge::default();
124        registry.register(
125            "guts_p2p_peers_connected",
126            "Number of connected P2P peers",
127            p2p_peers_connected.clone(),
128        );
129
130        let p2p_messages_total = Family::<P2pLabels, Counter>::default();
131        registry.register(
132            "guts_p2p_messages",
133            "Total P2P messages",
134            p2p_messages_total.clone(),
135        );
136
137        let p2p_message_latency_seconds =
138            Family::<P2pLabels, Histogram>::new_with_constructor(|| {
139                Histogram::new(exponential_buckets(0.001, 2.0, 16))
140            });
141        registry.register(
142            "guts_p2p_message_latency_seconds",
143            "P2P message latency in seconds",
144            p2p_message_latency_seconds.clone(),
145        );
146
147        // Storage metrics
148        let storage_objects_total = Family::<StorageLabels, Gauge>::default();
149        registry.register(
150            "guts_storage_objects",
151            "Total storage objects by type",
152            storage_objects_total.clone(),
153        );
154
155        let storage_operation_duration_seconds =
156            Family::<StorageLabels, Histogram>::new_with_constructor(|| {
157                Histogram::new(exponential_buckets(0.0001, 2.0, 16))
158            });
159        registry.register(
160            "guts_storage_operation_duration_seconds",
161            "Storage operation duration in seconds",
162            storage_operation_duration_seconds.clone(),
163        );
164
165        // Business metrics
166        let repositories_total = Gauge::default();
167        registry.register(
168            "guts_repositories",
169            "Total number of repositories",
170            repositories_total.clone(),
171        );
172
173        let pull_requests_total = Gauge::default();
174        registry.register(
175            "guts_pull_requests",
176            "Total number of pull requests",
177            pull_requests_total.clone(),
178        );
179
180        let issues_total = Gauge::default();
181        registry.register(
182            "guts_issues",
183            "Total number of issues",
184            issues_total.clone(),
185        );
186
187        let users_total = Gauge::default();
188        registry.register("guts_users", "Total number of users", users_total.clone());
189
190        let organizations_total = Gauge::default();
191        registry.register(
192            "guts_organizations",
193            "Total number of organizations",
194            organizations_total.clone(),
195        );
196
197        let websocket_connections = Gauge::default();
198        registry.register(
199            "guts_websocket_connections",
200            "Active WebSocket connections",
201            websocket_connections.clone(),
202        );
203
204        Self {
205            registry: Arc::new(RwLock::new(registry)),
206            http_requests_total,
207            http_request_duration_seconds,
208            http_active_connections,
209            p2p_peers_connected,
210            p2p_messages_total,
211            p2p_message_latency_seconds,
212            storage_objects_total,
213            storage_operation_duration_seconds,
214            repositories_total,
215            pull_requests_total,
216            issues_total,
217            users_total,
218            organizations_total,
219            websocket_connections,
220        }
221    }
222
223    /// Record an HTTP request.
224    pub fn record_http_request(&self, method: &str, path: &str, status: u16, duration_secs: f64) {
225        let labels = HttpLabels {
226            method: method.to_string(),
227            path: normalize_path(path),
228            status,
229        };
230
231        self.http_requests_total.get_or_create(&labels).inc();
232        self.http_request_duration_seconds
233            .get_or_create(&labels)
234            .observe(duration_secs);
235    }
236
237    /// Record a P2P message.
238    pub fn record_p2p_message(&self, message_type: &str, direction: &str, latency_secs: f64) {
239        let labels = P2pLabels {
240            message_type: message_type.to_string(),
241            direction: direction.to_string(),
242        };
243
244        self.p2p_messages_total.get_or_create(&labels).inc();
245        if latency_secs > 0.0 {
246            self.p2p_message_latency_seconds
247                .get_or_create(&labels)
248                .observe(latency_secs);
249        }
250    }
251
252    /// Encode metrics for Prometheus scraping.
253    pub fn encode(&self) -> String {
254        let mut buffer = String::new();
255        let registry = self.registry.read();
256        prometheus_client::encoding::text::encode(&mut buffer, &registry)
257            .expect("Failed to encode metrics");
258        buffer
259    }
260}
261
/// Normalize a request path for use as a metrics label.
///
/// The path is split on `/` and every non-empty segment that
/// [`is_dynamic_segment`] classifies as dynamic is replaced with `:param`.
/// Empty segments (leading, trailing or doubled slashes) are kept as they
/// are, so the slash structure of the input is preserved.
///
/// Classification is done against the *already normalized* prefix rather than
/// the raw segments. An owner or repository that happens to be called `repos`
/// or `git` has already been rewritten to `:param` by the time the segments
/// after it are inspected, so it cannot be mistaken for a route keyword:
/// `/api/repos/alice/git/pulls/5` becomes
/// `/api/repos/:param/:param/pulls/:param` rather than collapsing `pulls`
/// into `:param` as well.
fn normalize_path(path: &str) -> String {
    let mut normalized: Vec<&str> = Vec::new();
    for (index, segment) in path.split('/').enumerate() {
        // Empty segments are never replaced; they only keep the slashes in place.
        let dynamic = !segment.is_empty() && is_dynamic_segment(segment, index, &normalized);
        normalized.push(if dynamic { ":param" } else { segment });
    }
    normalized.join("/")
}

/// Decide whether the path segment at position `index` is dynamic.
///
/// `parts` holds the segments that precede `index`; only positions
/// `index - 1` and `index - 2` are consulted, and missing positions are
/// treated as "no match". A segment is dynamic when any of the following
/// holds:
///
/// - it is 36 characters long and contains a `-` (UUID-shaped),
/// - it consists solely of ASCII digits,
/// - one of the two preceding entries in `parts` is the route keyword
///   `repos` or `git`, i.e. the segment sits in an owner or repository slot.
fn is_dynamic_segment(segment: &str, index: usize, parts: &[&str]) -> bool {
    let uuid_shaped = segment.len() == 36 && segment.contains('-');
    if uuid_shaped {
        return true;
    }

    let numeric = segment.chars().all(|c| c.is_ascii_digit());
    if numeric {
        return true;
    }

    // `checked_sub` guards the first positions of the path, `get` guards a
    // `parts` slice shorter than `index`.
    let keyword_at_distance = |distance: usize| {
        let preceding = index.checked_sub(distance).and_then(|pos| parts.get(pos));
        matches!(preceding, Some(&("repos" | "git")))
    };
    keyword_at_distance(1) || keyword_at_distance(2)
}
312
#[cfg(test)]
mod tests {
    use super::*;

    /// Static paths are left alone; owner/name slots after `repos` and `git`
    /// are replaced with `:param`.
    #[test]
    fn test_normalize_path() {
        let cases = [
            ("/health", "/health"),
            ("/api/repos", "/api/repos"),
            ("/api/repos/alice/myrepo", "/api/repos/:param/:param"),
            (
                "/git/alice/myrepo/info/refs",
                "/git/:param/:param/info/refs",
            ),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(normalize_path(input), expected);
        }
    }

    /// A recorded HTTP request shows up in the encoded output.
    #[test]
    fn test_metrics_state_creation() {
        let state = MetricsState::new();
        state.record_http_request("GET", "/health", 200, 0.001);

        let rendered = state.encode();
        assert!(rendered.contains("guts_http_requests"));
    }
}