llm_config_metrics/
lib.rs

1//! Metrics and monitoring for LLM Config Manager
2//!
3//! This crate provides comprehensive metrics collection using Prometheus.
4//! It includes metrics for all major subsystems and health checks.
5
6pub mod collectors;
7pub mod health;
8
9use prometheus::{
10    Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, Opts, Registry,
11};
12use std::sync::Arc;
13use thiserror::Error;
14
15pub use collectors::*;
16pub use health::*;
17
18#[derive(Error, Debug)]
19pub enum MetricsError {
20    #[error("Prometheus error: {0}")]
21    PrometheusError(#[from] prometheus::Error),
22
23    #[error("Metrics not initialized")]
24    NotInitialized,
25
26    #[error("Health check failed: {0}")]
27    HealthCheckFailed(String),
28}
29
30pub type Result<T> = std::result::Result<T, MetricsError>;
31
32/// Global metrics registry
33pub struct MetricsRegistry {
34    registry: Arc<Registry>,
35    config_metrics: ConfigMetrics,
36    cache_metrics: CacheMetrics,
37    rbac_metrics: RbacMetrics,
38    audit_metrics: AuditMetrics,
39    storage_metrics: StorageMetrics,
40    crypto_metrics: CryptoMetrics,
41    system_metrics: SystemMetrics,
42}
43
44impl MetricsRegistry {
45    /// Create a new metrics registry
46    pub fn new() -> Result<Self> {
47        let registry = Arc::new(Registry::new());
48
49        Ok(Self {
50            config_metrics: ConfigMetrics::new(Arc::clone(&registry))?,
51            cache_metrics: CacheMetrics::new(Arc::clone(&registry))?,
52            rbac_metrics: RbacMetrics::new(Arc::clone(&registry))?,
53            audit_metrics: AuditMetrics::new(Arc::clone(&registry))?,
54            storage_metrics: StorageMetrics::new(Arc::clone(&registry))?,
55            crypto_metrics: CryptoMetrics::new(Arc::clone(&registry))?,
56            system_metrics: SystemMetrics::new(Arc::clone(&registry))?,
57            registry,
58        })
59    }
60
61    /// Get the Prometheus registry
62    pub fn registry(&self) -> Arc<Registry> {
63        Arc::clone(&self.registry)
64    }
65
66    /// Get configuration metrics
67    pub fn config(&self) -> &ConfigMetrics {
68        &self.config_metrics
69    }
70
71    /// Get cache metrics
72    pub fn cache(&self) -> &CacheMetrics {
73        &self.cache_metrics
74    }
75
76    /// Get RBAC metrics
77    pub fn rbac(&self) -> &RbacMetrics {
78        &self.rbac_metrics
79    }
80
81    /// Get audit metrics
82    pub fn audit(&self) -> &AuditMetrics {
83        &self.audit_metrics
84    }
85
86    /// Get storage metrics
87    pub fn storage(&self) -> &StorageMetrics {
88        &self.storage_metrics
89    }
90
91    /// Get crypto metrics
92    pub fn crypto(&self) -> &CryptoMetrics {
93        &self.crypto_metrics
94    }
95
96    /// Get system metrics
97    pub fn system(&self) -> &SystemMetrics {
98        &self.system_metrics
99    }
100
101    /// Gather all metrics in Prometheus format
102    pub fn gather(&self) -> Vec<prometheus::proto::MetricFamily> {
103        self.registry.gather()
104    }
105}
106
107impl Default for MetricsRegistry {
108    fn default() -> Self {
109        Self::new().expect("Failed to create metrics registry")
110    }
111}
112
113/// Configuration operation metrics
114pub struct ConfigMetrics {
115    operations_total: CounterVec,
116    operation_duration: HistogramVec,
117    active_configs: GaugeVec,
118    errors_total: CounterVec,
119}
120
121impl ConfigMetrics {
122    fn new(registry: Arc<Registry>) -> Result<Self> {
123        let operations_total = CounterVec::new(
124            Opts::new(
125                "config_operations_total",
126                "Total number of configuration operations",
127            ),
128            &["operation", "environment"],
129        )?;
130
131        let operation_duration = HistogramVec::new(
132            prometheus::HistogramOpts::new(
133                "config_operation_duration_seconds",
134                "Configuration operation duration in seconds",
135            )
136            .buckets(vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]),
137            &["operation"],
138        )?;
139
140        let active_configs = GaugeVec::new(
141            Opts::new(
142                "config_active_total",
143                "Number of active configurations",
144            ),
145            &["namespace", "environment"],
146        )?;
147
148        let errors_total = CounterVec::new(
149            Opts::new(
150                "config_errors_total",
151                "Total number of configuration errors",
152            ),
153            &["error_type", "operation"],
154        )?;
155
156        registry.register(Box::new(operations_total.clone()))?;
157        registry.register(Box::new(operation_duration.clone()))?;
158        registry.register(Box::new(active_configs.clone()))?;
159        registry.register(Box::new(errors_total.clone()))?;
160
161        Ok(Self {
162            operations_total,
163            operation_duration,
164            active_configs,
165            errors_total,
166        })
167    }
168
169    pub fn record_operation(&self, operation: &str, environment: &str) {
170        self.operations_total
171            .with_label_values(&[operation, environment])
172            .inc();
173    }
174
175    pub fn observe_duration(&self, operation: &str, duration: f64) {
176        self.operation_duration
177            .with_label_values(&[operation])
178            .observe(duration);
179    }
180
181    pub fn set_active_configs(&self, namespace: &str, environment: &str, count: i64) {
182        self.active_configs
183            .with_label_values(&[namespace, environment])
184            .set(count as f64);
185    }
186
187    pub fn record_error(&self, error_type: &str, operation: &str) {
188        self.errors_total
189            .with_label_values(&[error_type, operation])
190            .inc();
191    }
192}
193
194/// Cache metrics
195pub struct CacheMetrics {
196    hits_total: CounterVec,
197    misses_total: CounterVec,
198    evictions_total: CounterVec,
199    size: GaugeVec,
200    operation_duration: HistogramVec,
201}
202
203impl CacheMetrics {
204    fn new(registry: Arc<Registry>) -> Result<Self> {
205        let hits_total = CounterVec::new(
206            Opts::new("cache_hits_total", "Total cache hits"),
207            &["tier"],
208        )?;
209
210        let misses_total = CounterVec::new(
211            Opts::new("cache_misses_total", "Total cache misses"),
212            &["tier"],
213        )?;
214
215        let evictions_total = CounterVec::new(
216            Opts::new("cache_evictions_total", "Total cache evictions"),
217            &["tier"],
218        )?;
219
220        let size = GaugeVec::new(
221            Opts::new("cache_size_entries", "Current cache size in entries"),
222            &["tier"],
223        )?;
224
225        let operation_duration = HistogramVec::new(
226            prometheus::HistogramOpts::new(
227                "cache_operation_duration_seconds",
228                "Cache operation duration in seconds",
229            )
230            .buckets(vec![0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]),
231            &["operation", "tier"],
232        )?;
233
234        registry.register(Box::new(hits_total.clone()))?;
235        registry.register(Box::new(misses_total.clone()))?;
236        registry.register(Box::new(evictions_total.clone()))?;
237        registry.register(Box::new(size.clone()))?;
238        registry.register(Box::new(operation_duration.clone()))?;
239
240        Ok(Self {
241            hits_total,
242            misses_total,
243            evictions_total,
244            size,
245            operation_duration,
246        })
247    }
248
249    pub fn record_hit(&self, tier: &str) {
250        self.hits_total.with_label_values(&[tier]).inc();
251    }
252
253    pub fn record_miss(&self, tier: &str) {
254        self.misses_total.with_label_values(&[tier]).inc();
255    }
256
257    pub fn record_eviction(&self, tier: &str) {
258        self.evictions_total.with_label_values(&[tier]).inc();
259    }
260
261    pub fn set_size(&self, tier: &str, size: usize) {
262        self.size.with_label_values(&[tier]).set(size as f64);
263    }
264
265    pub fn observe_duration(&self, operation: &str, tier: &str, duration: f64) {
266        self.operation_duration
267            .with_label_values(&[operation, tier])
268            .observe(duration);
269    }
270
271    pub fn hit_rate(&self, tier: &str) -> f64 {
272        let hits = self.hits_total.with_label_values(&[tier]).get();
273        let misses = self.misses_total.with_label_values(&[tier]).get();
274        if hits + misses == 0.0 {
275            0.0
276        } else {
277            hits / (hits + misses)
278        }
279    }
280}
281
282/// RBAC metrics
283pub struct RbacMetrics {
284    permission_checks_total: CounterVec,
285    permission_denials_total: CounterVec,
286    check_duration: HistogramVec,
287    active_roles: GaugeVec,
288}
289
290impl RbacMetrics {
291    fn new(registry: Arc<Registry>) -> Result<Self> {
292        let permission_checks_total = CounterVec::new(
293            Opts::new(
294                "rbac_permission_checks_total",
295                "Total permission checks",
296            ),
297            &["resource", "action", "result"],
298        )?;
299
300        let permission_denials_total = CounterVec::new(
301            Opts::new(
302                "rbac_permission_denials_total",
303                "Total permission denials",
304            ),
305            &["resource", "action"],
306        )?;
307
308        let check_duration = HistogramVec::new(
309            prometheus::HistogramOpts::new(
310                "rbac_check_duration_seconds",
311                "Permission check duration in seconds",
312            )
313            .buckets(vec![0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005]),
314            &["resource"],
315        )?;
316
317        let active_roles = GaugeVec::new(
318            Opts::new("rbac_active_roles_total", "Number of active role assignments"),
319            &["role"],
320        )?;
321
322        registry.register(Box::new(permission_checks_total.clone()))?;
323        registry.register(Box::new(permission_denials_total.clone()))?;
324        registry.register(Box::new(check_duration.clone()))?;
325        registry.register(Box::new(active_roles.clone()))?;
326
327        Ok(Self {
328            permission_checks_total,
329            permission_denials_total,
330            check_duration,
331            active_roles,
332        })
333    }
334
335    pub fn record_permission_check(&self, resource: &str, action: &str, allowed: bool) {
336        let result = if allowed { "allowed" } else { "denied" };
337        self.permission_checks_total
338            .with_label_values(&[resource, action, result])
339            .inc();
340
341        if !allowed {
342            self.permission_denials_total
343                .with_label_values(&[resource, action])
344                .inc();
345        }
346    }
347
348    pub fn observe_check_duration(&self, resource: &str, duration: f64) {
349        self.check_duration
350            .with_label_values(&[resource])
351            .observe(duration);
352    }
353
354    pub fn set_active_roles(&self, role: &str, count: usize) {
355        self.active_roles
356            .with_label_values(&[role])
357            .set(count as f64);
358    }
359}
360
361/// Audit log metrics
362pub struct AuditMetrics {
363    events_total: CounterVec,
364    events_by_user: CounterVec,
365    event_processing_duration: Histogram,
366    queue_size: Gauge,
367}
368
369impl AuditMetrics {
370    fn new(registry: Arc<Registry>) -> Result<Self> {
371        let events_total = CounterVec::new(
372            Opts::new("audit_events_total", "Total audit events"),
373            &["event_type"],
374        )?;
375
376        let events_by_user = CounterVec::new(
377            Opts::new("audit_events_by_user_total", "Audit events by user"),
378            &["user"],
379        )?;
380
381        let event_processing_duration = Histogram::with_opts(
382            prometheus::HistogramOpts::new(
383                "audit_event_processing_duration_seconds",
384                "Audit event processing duration",
385            )
386            .buckets(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5]),
387        )?;
388
389        let queue_size = Gauge::new(
390            "audit_queue_size",
391            "Current audit event queue size",
392        )?;
393
394        registry.register(Box::new(events_total.clone()))?;
395        registry.register(Box::new(events_by_user.clone()))?;
396        registry.register(Box::new(event_processing_duration.clone()))?;
397        registry.register(Box::new(queue_size.clone()))?;
398
399        Ok(Self {
400            events_total,
401            events_by_user,
402            event_processing_duration,
403            queue_size,
404        })
405    }
406
407    pub fn record_event(&self, event_type: &str, user: &str) {
408        self.events_total.with_label_values(&[event_type]).inc();
409        self.events_by_user.with_label_values(&[user]).inc();
410    }
411
412    pub fn observe_processing_duration(&self, duration: f64) {
413        self.event_processing_duration.observe(duration);
414    }
415
416    pub fn set_queue_size(&self, size: usize) {
417        self.queue_size.set(size as f64);
418    }
419}
420
421/// Storage metrics
422pub struct StorageMetrics {
423    operations_total: CounterVec,
424    operation_duration: HistogramVec,
425    storage_size_bytes: Gauge,
426    errors_total: CounterVec,
427}
428
429impl StorageMetrics {
430    fn new(registry: Arc<Registry>) -> Result<Self> {
431        let operations_total = CounterVec::new(
432            Opts::new("storage_operations_total", "Total storage operations"),
433            &["operation"],
434        )?;
435
436        let operation_duration = HistogramVec::new(
437            prometheus::HistogramOpts::new(
438                "storage_operation_duration_seconds",
439                "Storage operation duration",
440            )
441            .buckets(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]),
442            &["operation"],
443        )?;
444
445        let storage_size_bytes = Gauge::new(
446            "storage_size_bytes",
447            "Total storage size in bytes",
448        )?;
449
450        let errors_total = CounterVec::new(
451            Opts::new("storage_errors_total", "Total storage errors"),
452            &["error_type"],
453        )?;
454
455        registry.register(Box::new(operations_total.clone()))?;
456        registry.register(Box::new(operation_duration.clone()))?;
457        registry.register(Box::new(storage_size_bytes.clone()))?;
458        registry.register(Box::new(errors_total.clone()))?;
459
460        Ok(Self {
461            operations_total,
462            operation_duration,
463            storage_size_bytes,
464            errors_total,
465        })
466    }
467
468    pub fn record_operation(&self, operation: &str) {
469        self.operations_total
470            .with_label_values(&[operation])
471            .inc();
472    }
473
474    pub fn observe_duration(&self, operation: &str, duration: f64) {
475        self.operation_duration
476            .with_label_values(&[operation])
477            .observe(duration);
478    }
479
480    pub fn set_size(&self, size_bytes: u64) {
481        self.storage_size_bytes.set(size_bytes as f64);
482    }
483
484    pub fn record_error(&self, error_type: &str) {
485        self.errors_total.with_label_values(&[error_type]).inc();
486    }
487}
488
489/// Cryptography metrics
490pub struct CryptoMetrics {
491    operations_total: CounterVec,
492    operation_duration: HistogramVec,
493    key_rotations_total: Counter,
494    encryption_errors_total: Counter,
495}
496
497impl CryptoMetrics {
498    fn new(registry: Arc<Registry>) -> Result<Self> {
499        let operations_total = CounterVec::new(
500            Opts::new("crypto_operations_total", "Total crypto operations"),
501            &["operation", "algorithm"],
502        )?;
503
504        let operation_duration = HistogramVec::new(
505            prometheus::HistogramOpts::new(
506                "crypto_operation_duration_seconds",
507                "Crypto operation duration",
508            )
509            .buckets(vec![0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]),
510            &["operation"],
511        )?;
512
513        let key_rotations_total = Counter::new(
514            "crypto_key_rotations_total",
515            "Total key rotations",
516        )?;
517
518        let encryption_errors_total = Counter::new(
519            "crypto_encryption_errors_total",
520            "Total encryption errors",
521        )?;
522
523        registry.register(Box::new(operations_total.clone()))?;
524        registry.register(Box::new(operation_duration.clone()))?;
525        registry.register(Box::new(key_rotations_total.clone()))?;
526        registry.register(Box::new(encryption_errors_total.clone()))?;
527
528        Ok(Self {
529            operations_total,
530            operation_duration,
531            key_rotations_total,
532            encryption_errors_total,
533        })
534    }
535
536    pub fn record_operation(&self, operation: &str, algorithm: &str) {
537        self.operations_total
538            .with_label_values(&[operation, algorithm])
539            .inc();
540    }
541
542    pub fn observe_duration(&self, operation: &str, duration: f64) {
543        self.operation_duration
544            .with_label_values(&[operation])
545            .observe(duration);
546    }
547
548    pub fn record_key_rotation(&self) {
549        self.key_rotations_total.inc();
550    }
551
552    pub fn record_encryption_error(&self) {
553        self.encryption_errors_total.inc();
554    }
555}
556
557/// System-wide metrics
558pub struct SystemMetrics {
559    uptime_seconds: Gauge,
560    memory_usage_bytes: Gauge,
561    goroutines: Gauge,
562    http_requests_total: CounterVec,
563    http_request_duration: HistogramVec,
564}
565
566impl SystemMetrics {
567    fn new(registry: Arc<Registry>) -> Result<Self> {
568        let uptime_seconds = Gauge::new(
569            "system_uptime_seconds",
570            "System uptime in seconds",
571        )?;
572
573        let memory_usage_bytes = Gauge::new(
574            "system_memory_usage_bytes",
575            "Current memory usage in bytes",
576        )?;
577
578        let goroutines = Gauge::new(
579            "system_goroutines",
580            "Number of goroutines",
581        )?;
582
583        let http_requests_total = CounterVec::new(
584            Opts::new("http_requests_total", "Total HTTP requests"),
585            &["method", "path", "status"],
586        )?;
587
588        let http_request_duration = HistogramVec::new(
589            prometheus::HistogramOpts::new(
590                "http_request_duration_seconds",
591                "HTTP request duration",
592            )
593            .buckets(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]),
594            &["method", "path"],
595        )?;
596
597        registry.register(Box::new(uptime_seconds.clone()))?;
598        registry.register(Box::new(memory_usage_bytes.clone()))?;
599        registry.register(Box::new(goroutines.clone()))?;
600        registry.register(Box::new(http_requests_total.clone()))?;
601        registry.register(Box::new(http_request_duration.clone()))?;
602
603        Ok(Self {
604            uptime_seconds,
605            memory_usage_bytes,
606            goroutines,
607            http_requests_total,
608            http_request_duration,
609        })
610    }
611
612    pub fn set_uptime(&self, seconds: f64) {
613        self.uptime_seconds.set(seconds);
614    }
615
616    pub fn set_memory_usage(&self, bytes: u64) {
617        self.memory_usage_bytes.set(bytes as f64);
618    }
619
620    pub fn set_goroutines(&self, count: usize) {
621        self.goroutines.set(count as f64);
622    }
623
624    pub fn record_http_request(&self, method: &str, path: &str, status: u16) {
625        self.http_requests_total
626            .with_label_values(&[method, path, &status.to_string()])
627            .inc();
628    }
629
630    pub fn observe_http_duration(&self, method: &str, path: &str, duration: f64) {
631        self.http_request_duration
632            .with_label_values(&[method, path])
633            .observe(duration);
634    }
635}
636
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly built registry already exposes at least one metric family.
    #[test]
    fn test_metrics_registry_creation() {
        let metrics = MetricsRegistry::new().unwrap();
        let families = metrics.gather();
        assert!(!families.is_empty());
    }

    /// Every `ConfigMetrics` recorder can be called and the result gathered.
    #[test]
    fn test_config_metrics() {
        let metrics = MetricsRegistry::new().unwrap();
        let config = metrics.config();

        config.record_operation("set", "production");
        config.observe_duration("set", 0.005);
        config.set_active_configs("test/ns", "production", 42);
        config.record_error("validation", "set");

        let families = metrics.gather();
        assert!(!families.is_empty());
    }

    /// One hit and one miss on the same tier yield a 50% hit rate.
    #[test]
    fn test_cache_metrics() {
        let metrics = MetricsRegistry::new().unwrap();
        let cache = metrics.cache();

        cache.record_hit("l1");
        cache.record_miss("l1");
        cache.set_size("l1", 100);

        let rate = cache.hit_rate("l1");
        let deviation = (rate - 0.5).abs();
        assert!(deviation < 0.01); // 1 hit, 1 miss = 50%
    }

    /// Every `RbacMetrics` recorder can be called and the result gathered.
    #[test]
    fn test_rbac_metrics() {
        let metrics = MetricsRegistry::new().unwrap();
        let rbac = metrics.rbac();

        rbac.record_permission_check("config", "read", true);
        rbac.record_permission_check("config", "write", false);
        rbac.observe_check_duration("config", 0.0001);
        rbac.set_active_roles("admin", 5);

        let families = metrics.gather();
        assert!(!families.is_empty());
    }
}