1pub mod collectors;
7pub mod health;
8
9use prometheus::{
10 Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, Opts, Registry,
11};
12use std::sync::Arc;
13use thiserror::Error;
14
15pub use collectors::*;
16pub use health::*;
17
18#[derive(Error, Debug)]
19pub enum MetricsError {
20 #[error("Prometheus error: {0}")]
21 PrometheusError(#[from] prometheus::Error),
22
23 #[error("Metrics not initialized")]
24 NotInitialized,
25
26 #[error("Health check failed: {0}")]
27 HealthCheckFailed(String),
28}
29
30pub type Result<T> = std::result::Result<T, MetricsError>;
31
32pub struct MetricsRegistry {
34 registry: Arc<Registry>,
35 config_metrics: ConfigMetrics,
36 cache_metrics: CacheMetrics,
37 rbac_metrics: RbacMetrics,
38 audit_metrics: AuditMetrics,
39 storage_metrics: StorageMetrics,
40 crypto_metrics: CryptoMetrics,
41 system_metrics: SystemMetrics,
42}
43
44impl MetricsRegistry {
45 pub fn new() -> Result<Self> {
47 let registry = Arc::new(Registry::new());
48
49 Ok(Self {
50 config_metrics: ConfigMetrics::new(Arc::clone(®istry))?,
51 cache_metrics: CacheMetrics::new(Arc::clone(®istry))?,
52 rbac_metrics: RbacMetrics::new(Arc::clone(®istry))?,
53 audit_metrics: AuditMetrics::new(Arc::clone(®istry))?,
54 storage_metrics: StorageMetrics::new(Arc::clone(®istry))?,
55 crypto_metrics: CryptoMetrics::new(Arc::clone(®istry))?,
56 system_metrics: SystemMetrics::new(Arc::clone(®istry))?,
57 registry,
58 })
59 }
60
61 pub fn registry(&self) -> Arc<Registry> {
63 Arc::clone(&self.registry)
64 }
65
66 pub fn config(&self) -> &ConfigMetrics {
68 &self.config_metrics
69 }
70
71 pub fn cache(&self) -> &CacheMetrics {
73 &self.cache_metrics
74 }
75
76 pub fn rbac(&self) -> &RbacMetrics {
78 &self.rbac_metrics
79 }
80
81 pub fn audit(&self) -> &AuditMetrics {
83 &self.audit_metrics
84 }
85
86 pub fn storage(&self) -> &StorageMetrics {
88 &self.storage_metrics
89 }
90
91 pub fn crypto(&self) -> &CryptoMetrics {
93 &self.crypto_metrics
94 }
95
96 pub fn system(&self) -> &SystemMetrics {
98 &self.system_metrics
99 }
100
101 pub fn gather(&self) -> Vec<prometheus::proto::MetricFamily> {
103 self.registry.gather()
104 }
105}
106
107impl Default for MetricsRegistry {
108 fn default() -> Self {
109 Self::new().expect("Failed to create metrics registry")
110 }
111}
112
113pub struct ConfigMetrics {
115 operations_total: CounterVec,
116 operation_duration: HistogramVec,
117 active_configs: GaugeVec,
118 errors_total: CounterVec,
119}
120
121impl ConfigMetrics {
122 fn new(registry: Arc<Registry>) -> Result<Self> {
123 let operations_total = CounterVec::new(
124 Opts::new(
125 "config_operations_total",
126 "Total number of configuration operations",
127 ),
128 &["operation", "environment"],
129 )?;
130
131 let operation_duration = HistogramVec::new(
132 prometheus::HistogramOpts::new(
133 "config_operation_duration_seconds",
134 "Configuration operation duration in seconds",
135 )
136 .buckets(vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]),
137 &["operation"],
138 )?;
139
140 let active_configs = GaugeVec::new(
141 Opts::new(
142 "config_active_total",
143 "Number of active configurations",
144 ),
145 &["namespace", "environment"],
146 )?;
147
148 let errors_total = CounterVec::new(
149 Opts::new(
150 "config_errors_total",
151 "Total number of configuration errors",
152 ),
153 &["error_type", "operation"],
154 )?;
155
156 registry.register(Box::new(operations_total.clone()))?;
157 registry.register(Box::new(operation_duration.clone()))?;
158 registry.register(Box::new(active_configs.clone()))?;
159 registry.register(Box::new(errors_total.clone()))?;
160
161 Ok(Self {
162 operations_total,
163 operation_duration,
164 active_configs,
165 errors_total,
166 })
167 }
168
169 pub fn record_operation(&self, operation: &str, environment: &str) {
170 self.operations_total
171 .with_label_values(&[operation, environment])
172 .inc();
173 }
174
175 pub fn observe_duration(&self, operation: &str, duration: f64) {
176 self.operation_duration
177 .with_label_values(&[operation])
178 .observe(duration);
179 }
180
181 pub fn set_active_configs(&self, namespace: &str, environment: &str, count: i64) {
182 self.active_configs
183 .with_label_values(&[namespace, environment])
184 .set(count as f64);
185 }
186
187 pub fn record_error(&self, error_type: &str, operation: &str) {
188 self.errors_total
189 .with_label_values(&[error_type, operation])
190 .inc();
191 }
192}
193
194pub struct CacheMetrics {
196 hits_total: CounterVec,
197 misses_total: CounterVec,
198 evictions_total: CounterVec,
199 size: GaugeVec,
200 operation_duration: HistogramVec,
201}
202
203impl CacheMetrics {
204 fn new(registry: Arc<Registry>) -> Result<Self> {
205 let hits_total = CounterVec::new(
206 Opts::new("cache_hits_total", "Total cache hits"),
207 &["tier"],
208 )?;
209
210 let misses_total = CounterVec::new(
211 Opts::new("cache_misses_total", "Total cache misses"),
212 &["tier"],
213 )?;
214
215 let evictions_total = CounterVec::new(
216 Opts::new("cache_evictions_total", "Total cache evictions"),
217 &["tier"],
218 )?;
219
220 let size = GaugeVec::new(
221 Opts::new("cache_size_entries", "Current cache size in entries"),
222 &["tier"],
223 )?;
224
225 let operation_duration = HistogramVec::new(
226 prometheus::HistogramOpts::new(
227 "cache_operation_duration_seconds",
228 "Cache operation duration in seconds",
229 )
230 .buckets(vec![0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]),
231 &["operation", "tier"],
232 )?;
233
234 registry.register(Box::new(hits_total.clone()))?;
235 registry.register(Box::new(misses_total.clone()))?;
236 registry.register(Box::new(evictions_total.clone()))?;
237 registry.register(Box::new(size.clone()))?;
238 registry.register(Box::new(operation_duration.clone()))?;
239
240 Ok(Self {
241 hits_total,
242 misses_total,
243 evictions_total,
244 size,
245 operation_duration,
246 })
247 }
248
249 pub fn record_hit(&self, tier: &str) {
250 self.hits_total.with_label_values(&[tier]).inc();
251 }
252
253 pub fn record_miss(&self, tier: &str) {
254 self.misses_total.with_label_values(&[tier]).inc();
255 }
256
257 pub fn record_eviction(&self, tier: &str) {
258 self.evictions_total.with_label_values(&[tier]).inc();
259 }
260
261 pub fn set_size(&self, tier: &str, size: usize) {
262 self.size.with_label_values(&[tier]).set(size as f64);
263 }
264
265 pub fn observe_duration(&self, operation: &str, tier: &str, duration: f64) {
266 self.operation_duration
267 .with_label_values(&[operation, tier])
268 .observe(duration);
269 }
270
271 pub fn hit_rate(&self, tier: &str) -> f64 {
272 let hits = self.hits_total.with_label_values(&[tier]).get();
273 let misses = self.misses_total.with_label_values(&[tier]).get();
274 if hits + misses == 0.0 {
275 0.0
276 } else {
277 hits / (hits + misses)
278 }
279 }
280}
281
282pub struct RbacMetrics {
284 permission_checks_total: CounterVec,
285 permission_denials_total: CounterVec,
286 check_duration: HistogramVec,
287 active_roles: GaugeVec,
288}
289
290impl RbacMetrics {
291 fn new(registry: Arc<Registry>) -> Result<Self> {
292 let permission_checks_total = CounterVec::new(
293 Opts::new(
294 "rbac_permission_checks_total",
295 "Total permission checks",
296 ),
297 &["resource", "action", "result"],
298 )?;
299
300 let permission_denials_total = CounterVec::new(
301 Opts::new(
302 "rbac_permission_denials_total",
303 "Total permission denials",
304 ),
305 &["resource", "action"],
306 )?;
307
308 let check_duration = HistogramVec::new(
309 prometheus::HistogramOpts::new(
310 "rbac_check_duration_seconds",
311 "Permission check duration in seconds",
312 )
313 .buckets(vec![0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005]),
314 &["resource"],
315 )?;
316
317 let active_roles = GaugeVec::new(
318 Opts::new("rbac_active_roles_total", "Number of active role assignments"),
319 &["role"],
320 )?;
321
322 registry.register(Box::new(permission_checks_total.clone()))?;
323 registry.register(Box::new(permission_denials_total.clone()))?;
324 registry.register(Box::new(check_duration.clone()))?;
325 registry.register(Box::new(active_roles.clone()))?;
326
327 Ok(Self {
328 permission_checks_total,
329 permission_denials_total,
330 check_duration,
331 active_roles,
332 })
333 }
334
335 pub fn record_permission_check(&self, resource: &str, action: &str, allowed: bool) {
336 let result = if allowed { "allowed" } else { "denied" };
337 self.permission_checks_total
338 .with_label_values(&[resource, action, result])
339 .inc();
340
341 if !allowed {
342 self.permission_denials_total
343 .with_label_values(&[resource, action])
344 .inc();
345 }
346 }
347
348 pub fn observe_check_duration(&self, resource: &str, duration: f64) {
349 self.check_duration
350 .with_label_values(&[resource])
351 .observe(duration);
352 }
353
354 pub fn set_active_roles(&self, role: &str, count: usize) {
355 self.active_roles
356 .with_label_values(&[role])
357 .set(count as f64);
358 }
359}
360
361pub struct AuditMetrics {
363 events_total: CounterVec,
364 events_by_user: CounterVec,
365 event_processing_duration: Histogram,
366 queue_size: Gauge,
367}
368
369impl AuditMetrics {
370 fn new(registry: Arc<Registry>) -> Result<Self> {
371 let events_total = CounterVec::new(
372 Opts::new("audit_events_total", "Total audit events"),
373 &["event_type"],
374 )?;
375
376 let events_by_user = CounterVec::new(
377 Opts::new("audit_events_by_user_total", "Audit events by user"),
378 &["user"],
379 )?;
380
381 let event_processing_duration = Histogram::with_opts(
382 prometheus::HistogramOpts::new(
383 "audit_event_processing_duration_seconds",
384 "Audit event processing duration",
385 )
386 .buckets(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5]),
387 )?;
388
389 let queue_size = Gauge::new(
390 "audit_queue_size",
391 "Current audit event queue size",
392 )?;
393
394 registry.register(Box::new(events_total.clone()))?;
395 registry.register(Box::new(events_by_user.clone()))?;
396 registry.register(Box::new(event_processing_duration.clone()))?;
397 registry.register(Box::new(queue_size.clone()))?;
398
399 Ok(Self {
400 events_total,
401 events_by_user,
402 event_processing_duration,
403 queue_size,
404 })
405 }
406
407 pub fn record_event(&self, event_type: &str, user: &str) {
408 self.events_total.with_label_values(&[event_type]).inc();
409 self.events_by_user.with_label_values(&[user]).inc();
410 }
411
412 pub fn observe_processing_duration(&self, duration: f64) {
413 self.event_processing_duration.observe(duration);
414 }
415
416 pub fn set_queue_size(&self, size: usize) {
417 self.queue_size.set(size as f64);
418 }
419}
420
421pub struct StorageMetrics {
423 operations_total: CounterVec,
424 operation_duration: HistogramVec,
425 storage_size_bytes: Gauge,
426 errors_total: CounterVec,
427}
428
429impl StorageMetrics {
430 fn new(registry: Arc<Registry>) -> Result<Self> {
431 let operations_total = CounterVec::new(
432 Opts::new("storage_operations_total", "Total storage operations"),
433 &["operation"],
434 )?;
435
436 let operation_duration = HistogramVec::new(
437 prometheus::HistogramOpts::new(
438 "storage_operation_duration_seconds",
439 "Storage operation duration",
440 )
441 .buckets(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]),
442 &["operation"],
443 )?;
444
445 let storage_size_bytes = Gauge::new(
446 "storage_size_bytes",
447 "Total storage size in bytes",
448 )?;
449
450 let errors_total = CounterVec::new(
451 Opts::new("storage_errors_total", "Total storage errors"),
452 &["error_type"],
453 )?;
454
455 registry.register(Box::new(operations_total.clone()))?;
456 registry.register(Box::new(operation_duration.clone()))?;
457 registry.register(Box::new(storage_size_bytes.clone()))?;
458 registry.register(Box::new(errors_total.clone()))?;
459
460 Ok(Self {
461 operations_total,
462 operation_duration,
463 storage_size_bytes,
464 errors_total,
465 })
466 }
467
468 pub fn record_operation(&self, operation: &str) {
469 self.operations_total
470 .with_label_values(&[operation])
471 .inc();
472 }
473
474 pub fn observe_duration(&self, operation: &str, duration: f64) {
475 self.operation_duration
476 .with_label_values(&[operation])
477 .observe(duration);
478 }
479
480 pub fn set_size(&self, size_bytes: u64) {
481 self.storage_size_bytes.set(size_bytes as f64);
482 }
483
484 pub fn record_error(&self, error_type: &str) {
485 self.errors_total.with_label_values(&[error_type]).inc();
486 }
487}
488
489pub struct CryptoMetrics {
491 operations_total: CounterVec,
492 operation_duration: HistogramVec,
493 key_rotations_total: Counter,
494 encryption_errors_total: Counter,
495}
496
497impl CryptoMetrics {
498 fn new(registry: Arc<Registry>) -> Result<Self> {
499 let operations_total = CounterVec::new(
500 Opts::new("crypto_operations_total", "Total crypto operations"),
501 &["operation", "algorithm"],
502 )?;
503
504 let operation_duration = HistogramVec::new(
505 prometheus::HistogramOpts::new(
506 "crypto_operation_duration_seconds",
507 "Crypto operation duration",
508 )
509 .buckets(vec![0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]),
510 &["operation"],
511 )?;
512
513 let key_rotations_total = Counter::new(
514 "crypto_key_rotations_total",
515 "Total key rotations",
516 )?;
517
518 let encryption_errors_total = Counter::new(
519 "crypto_encryption_errors_total",
520 "Total encryption errors",
521 )?;
522
523 registry.register(Box::new(operations_total.clone()))?;
524 registry.register(Box::new(operation_duration.clone()))?;
525 registry.register(Box::new(key_rotations_total.clone()))?;
526 registry.register(Box::new(encryption_errors_total.clone()))?;
527
528 Ok(Self {
529 operations_total,
530 operation_duration,
531 key_rotations_total,
532 encryption_errors_total,
533 })
534 }
535
536 pub fn record_operation(&self, operation: &str, algorithm: &str) {
537 self.operations_total
538 .with_label_values(&[operation, algorithm])
539 .inc();
540 }
541
542 pub fn observe_duration(&self, operation: &str, duration: f64) {
543 self.operation_duration
544 .with_label_values(&[operation])
545 .observe(duration);
546 }
547
548 pub fn record_key_rotation(&self) {
549 self.key_rotations_total.inc();
550 }
551
552 pub fn record_encryption_error(&self) {
553 self.encryption_errors_total.inc();
554 }
555}
556
557pub struct SystemMetrics {
559 uptime_seconds: Gauge,
560 memory_usage_bytes: Gauge,
561 goroutines: Gauge,
562 http_requests_total: CounterVec,
563 http_request_duration: HistogramVec,
564}
565
566impl SystemMetrics {
567 fn new(registry: Arc<Registry>) -> Result<Self> {
568 let uptime_seconds = Gauge::new(
569 "system_uptime_seconds",
570 "System uptime in seconds",
571 )?;
572
573 let memory_usage_bytes = Gauge::new(
574 "system_memory_usage_bytes",
575 "Current memory usage in bytes",
576 )?;
577
578 let goroutines = Gauge::new(
579 "system_goroutines",
580 "Number of goroutines",
581 )?;
582
583 let http_requests_total = CounterVec::new(
584 Opts::new("http_requests_total", "Total HTTP requests"),
585 &["method", "path", "status"],
586 )?;
587
588 let http_request_duration = HistogramVec::new(
589 prometheus::HistogramOpts::new(
590 "http_request_duration_seconds",
591 "HTTP request duration",
592 )
593 .buckets(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]),
594 &["method", "path"],
595 )?;
596
597 registry.register(Box::new(uptime_seconds.clone()))?;
598 registry.register(Box::new(memory_usage_bytes.clone()))?;
599 registry.register(Box::new(goroutines.clone()))?;
600 registry.register(Box::new(http_requests_total.clone()))?;
601 registry.register(Box::new(http_request_duration.clone()))?;
602
603 Ok(Self {
604 uptime_seconds,
605 memory_usage_bytes,
606 goroutines,
607 http_requests_total,
608 http_request_duration,
609 })
610 }
611
612 pub fn set_uptime(&self, seconds: f64) {
613 self.uptime_seconds.set(seconds);
614 }
615
616 pub fn set_memory_usage(&self, bytes: u64) {
617 self.memory_usage_bytes.set(bytes as f64);
618 }
619
620 pub fn set_goroutines(&self, count: usize) {
621 self.goroutines.set(count as f64);
622 }
623
624 pub fn record_http_request(&self, method: &str, path: &str, status: u16) {
625 self.http_requests_total
626 .with_label_values(&[method, path, &status.to_string()])
627 .inc();
628 }
629
630 pub fn observe_http_duration(&self, method: &str, path: &str, duration: f64) {
631 self.http_request_duration
632 .with_label_values(&[method, path])
633 .observe(duration);
634 }
635}
636
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fresh registry, panicking if construction fails.
    fn fresh_registry() -> MetricsRegistry {
        MetricsRegistry::new().unwrap()
    }

    /// A newly built registry already gathers at least one metric family.
    #[test]
    fn test_metrics_registry_creation() {
        let metrics = fresh_registry();
        let families = metrics.gather();
        assert!(!families.is_empty());
    }

    /// Exercises every `ConfigMetrics` recording method.
    #[test]
    fn test_config_metrics() {
        let metrics = fresh_registry();
        let config = metrics.config();

        config.record_operation("set", "production");
        config.observe_duration("set", 0.005);
        config.set_active_configs("test/ns", "production", 42);
        config.record_error("validation", "set");

        let families = metrics.gather();
        assert!(!families.is_empty());
    }

    /// One hit plus one miss on the same tier gives a hit rate of one half.
    #[test]
    fn test_cache_metrics() {
        let metrics = fresh_registry();
        let cache = metrics.cache();

        cache.record_hit("l1");
        cache.record_miss("l1");
        cache.set_size("l1", 100);

        let hit_rate = cache.hit_rate("l1");
        assert!((hit_rate - 0.5).abs() < 0.01);
    }

    /// Exercises every `RbacMetrics` recording method.
    #[test]
    fn test_rbac_metrics() {
        let metrics = fresh_registry();
        let rbac = metrics.rbac();

        rbac.record_permission_check("config", "read", true);
        rbac.record_permission_check("config", "write", false);
        rbac.observe_check_duration("config", 0.0001);
        rbac.set_active_roles("admin", 5);

        let families = metrics.gather();
        assert!(!families.is_empty());
    }
}