role_system/
telemetry.rs

1//! Telemetry and observability integration for comprehensive monitoring.
2//!
3//! This module provides metrics, tracing, and monitoring capabilities for role system operations.
4//! It includes a working implementation with optional OpenTelemetry integration.
5
6use std::collections::HashMap;
7use std::sync::Arc;
8use std::sync::atomic::{AtomicU64, Ordering};
9use std::time::{Duration, Instant};
10
11#[cfg(feature = "audit")]
12use log::{debug, error, info, warn};
13
/// Telemetry configuration for the role system.
///
/// The `enable_*` switches are consulted by the recording methods of
/// `TelemetryProvider`: when a switch is off, the matching `record_*` calls
/// return without touching any counter.
///
/// The type is comparable (`PartialEq`/`Eq`) so configurations can be checked
/// for equality in tests and change detection.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TelemetryConfig {
    /// Service name for metrics and tracing. Logged when a provider is created
    /// and the `audit` feature is enabled.
    pub service_name: String,
    /// Service version for identification.
    pub service_version: String,
    /// Enable detailed operation tracking.
    ///
    /// NOTE(review): nothing in this module reads this flag — confirm whether
    /// another module depends on it.
    pub detailed_tracking: bool,
    /// Enable metrics collection (permission checks, role operations and
    /// cache hit/miss counters).
    pub enable_metrics: bool,
    /// Enable error tracking (the permission-check error counter).
    pub enable_error_tracking: bool,
    /// Enable performance tracking (accumulation of operation durations).
    pub enable_performance_tracking: bool,
}
30
31impl Default for TelemetryConfig {
32    fn default() -> Self {
33        Self {
34            service_name: "role-system".to_string(),
35            service_version: env!("CARGO_PKG_VERSION").to_string(),
36            detailed_tracking: true,
37            enable_metrics: true,
38            enable_error_tracking: true,
39            enable_performance_tracking: true,
40        }
41    }
42}
43
/// Metrics collected by the telemetry system.
///
/// A plain-data snapshot of the counters; `TelemetryMetrics::default()` is the
/// all-zero snapshot with an empty `errors_by_type` map.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct TelemetryMetrics {
    /// Total permission checks performed
    pub permission_checks_total: u64,
    /// Permission checks that were granted
    pub permission_checks_granted: u64,
    /// Permission checks that were denied
    pub permission_checks_denied: u64,
    /// Permission check errors
    pub permission_check_errors: u64,

    /// Role operations performed
    pub role_operations_total: u64,
    /// Role operation errors
    pub role_operation_errors: u64,

    /// Cache lookups that were hits
    pub cache_hits: u64,
    /// Cache lookups that were misses
    pub cache_misses: u64,

    /// Sum of all recorded operation durations, in milliseconds
    pub total_operation_time_ms: u64,
    /// Mean recorded operation duration, in milliseconds.
    ///
    /// `TelemetryProvider::get_metrics` derives this from every duration passed
    /// to `record_operation_duration`, not only from permission checks.
    pub avg_permission_check_time_ms: f64,

    /// Error counts keyed by error type.
    ///
    /// `TelemetryProvider::get_metrics` currently always returns this empty.
    pub errors_by_type: HashMap<String, u64>,
}
90
91/// Working telemetry provider for the role system.
92pub struct TelemetryProvider {
93    config: TelemetryConfig,
94
95    // Atomic counters for thread-safe metrics
96    permission_checks_total: Arc<AtomicU64>,
97    permission_checks_granted: Arc<AtomicU64>,
98    permission_checks_denied: Arc<AtomicU64>,
99    permission_check_errors: Arc<AtomicU64>,
100
101    role_operations_total: Arc<AtomicU64>,
102    role_operation_errors: Arc<AtomicU64>,
103
104    cache_hits: Arc<AtomicU64>,
105    cache_misses: Arc<AtomicU64>,
106
107    total_operation_time_ms: Arc<AtomicU64>,
108    operation_count: Arc<AtomicU64>,
109
110    start_time: Instant,
111}
112
113impl Default for TelemetryProvider {
114    fn default() -> Self {
115        Self::new()
116    }
117}
118
119impl TelemetryProvider {
120    /// Create a new telemetry provider.
121    pub fn new() -> Self {
122        Self::with_config(TelemetryConfig::default())
123    }
124
125    /// Create a new telemetry provider with custom configuration.
126    pub fn with_config(config: TelemetryConfig) -> Self {
127        #[cfg(feature = "audit")]
128        info!("Initializing telemetry provider: {}", config.service_name);
129
130        Self {
131            config,
132            permission_checks_total: Arc::new(AtomicU64::new(0)),
133            permission_checks_granted: Arc::new(AtomicU64::new(0)),
134            permission_checks_denied: Arc::new(AtomicU64::new(0)),
135            permission_check_errors: Arc::new(AtomicU64::new(0)),
136            role_operations_total: Arc::new(AtomicU64::new(0)),
137            role_operation_errors: Arc::new(AtomicU64::new(0)),
138            cache_hits: Arc::new(AtomicU64::new(0)),
139            cache_misses: Arc::new(AtomicU64::new(0)),
140            total_operation_time_ms: Arc::new(AtomicU64::new(0)),
141            operation_count: Arc::new(AtomicU64::new(0)),
142            start_time: Instant::now(),
143        }
144    }
145
146    /// Record a permission check operation.
147    pub fn record_permission_check(
148        &self,
149        subject: &str,
150        action: &str,
151        resource: &str,
152        granted: bool,
153    ) {
154        if !self.config.enable_metrics {
155            return;
156        }
157
158        self.permission_checks_total.fetch_add(1, Ordering::Relaxed);
159
160        if granted {
161            self.permission_checks_granted
162                .fetch_add(1, Ordering::Relaxed);
163            #[cfg(feature = "audit")]
164            debug!("Permission granted: {} -> {}:{}", subject, action, resource);
165        } else {
166            self.permission_checks_denied
167                .fetch_add(1, Ordering::Relaxed);
168            #[cfg(feature = "audit")]
169            debug!("Permission denied: {} -> {}:{}", subject, action, resource);
170        }
171    }
172
173    /// Record a permission check error.
174    pub fn record_permission_check_error(
175        &self,
176        subject: &str,
177        action: &str,
178        resource: &str,
179        error: &crate::error::Error,
180    ) {
181        if !self.config.enable_error_tracking {
182            return;
183        }
184
185        self.permission_check_errors.fetch_add(1, Ordering::Relaxed);
186
187        #[cfg(feature = "audit")]
188        error!(
189            "Permission check error: {} -> {}:{} - {}",
190            subject, action, resource, error
191        );
192    }
193
194    /// Record a role operation.
195    pub fn record_role_operation(&self, operation: &str, role: &str, success: bool) {
196        if !self.config.enable_metrics {
197            return;
198        }
199
200        self.role_operations_total.fetch_add(1, Ordering::Relaxed);
201
202        if success {
203            #[cfg(feature = "audit")]
204            debug!("Role operation successful: {} on {}", operation, role);
205        } else {
206            self.role_operation_errors.fetch_add(1, Ordering::Relaxed);
207            #[cfg(feature = "audit")]
208            warn!("Role operation failed: {} on {}", operation, role);
209        }
210    }
211
212    /// Record cache operation.
213    pub fn record_cache_operation(&self, hit: bool) {
214        if !self.config.enable_metrics {
215            return;
216        }
217
218        if hit {
219            self.cache_hits.fetch_add(1, Ordering::Relaxed);
220        } else {
221            self.cache_misses.fetch_add(1, Ordering::Relaxed);
222        }
223    }
224
225    /// Record operation duration.
226    pub fn record_operation_duration(&self, _operation: &str, duration: Duration) {
227        if !self.config.enable_performance_tracking {
228            return;
229        }
230
231        self.total_operation_time_ms
232            .fetch_add(duration.as_millis() as u64, Ordering::Relaxed);
233        self.operation_count.fetch_add(1, Ordering::Relaxed);
234    }
235
236    /// Get current metrics snapshot.
237    pub fn get_metrics(&self) -> TelemetryMetrics {
238        let operation_count = self.operation_count.load(Ordering::Relaxed);
239        let total_time = self.total_operation_time_ms.load(Ordering::Relaxed);
240
241        let avg_permission_check_time_ms = if operation_count > 0 {
242            total_time as f64 / operation_count as f64
243        } else {
244            0.0
245        };
246
247        TelemetryMetrics {
248            permission_checks_total: self.permission_checks_total.load(Ordering::Relaxed),
249            permission_checks_granted: self.permission_checks_granted.load(Ordering::Relaxed),
250            permission_checks_denied: self.permission_checks_denied.load(Ordering::Relaxed),
251            permission_check_errors: self.permission_check_errors.load(Ordering::Relaxed),
252            role_operations_total: self.role_operations_total.load(Ordering::Relaxed),
253            role_operation_errors: self.role_operation_errors.load(Ordering::Relaxed),
254            cache_hits: self.cache_hits.load(Ordering::Relaxed),
255            cache_misses: self.cache_misses.load(Ordering::Relaxed),
256            total_operation_time_ms: total_time,
257            avg_permission_check_time_ms,
258            errors_by_type: HashMap::new(), // Could be enhanced with more detailed tracking
259        }
260    }
261
262    /// Get system uptime.
263    pub fn uptime(&self) -> Duration {
264        self.start_time.elapsed()
265    }
266
267    /// Reset all metrics (useful for testing).
268    pub fn reset_metrics(&self) {
269        self.permission_checks_total.store(0, Ordering::Relaxed);
270        self.permission_checks_granted.store(0, Ordering::Relaxed);
271        self.permission_checks_denied.store(0, Ordering::Relaxed);
272        self.permission_check_errors.store(0, Ordering::Relaxed);
273        self.role_operations_total.store(0, Ordering::Relaxed);
274        self.role_operation_errors.store(0, Ordering::Relaxed);
275        self.cache_hits.store(0, Ordering::Relaxed);
276        self.cache_misses.store(0, Ordering::Relaxed);
277        self.total_operation_time_ms.store(0, Ordering::Relaxed);
278        self.operation_count.store(0, Ordering::Relaxed);
279    }
280}
281
/// Telemetry wrapper for instrumented operations.
///
/// Captures the instant of creation together with a name, so the elapsed time
/// of a piece of work can be read with [`InstrumentedOperation::duration`] or
/// [`InstrumentedOperation::finish`]. Log output is produced only when the
/// `audit` feature is enabled.
#[derive(Debug)]
pub struct InstrumentedOperation {
    start_time: Instant,
    operation_name: String,
}

impl InstrumentedOperation {
    /// Create a new instrumented operation; timing starts immediately.
    pub fn new(operation_name: impl Into<String>) -> Self {
        Self {
            start_time: Instant::now(),
            operation_name: operation_name.into(),
        }
    }

    /// Name given to the operation at construction.
    ///
    /// Also keeps the field in use when the `audit` feature is disabled and no
    /// log line reads it.
    pub fn operation_name(&self) -> &str {
        &self.operation_name
    }

    /// Add context to the operation (for compatibility, currently logs with audit feature).
    ///
    /// The attribute is written to the debug log and not stored.
    #[cfg(feature = "audit")]
    pub fn set_attribute(&mut self, key: &str, value: impl Into<String>) {
        debug!(
            "Operation {}: {} = {}",
            self.operation_name,
            key,
            value.into()
        );
    }

    /// Add context to the operation (no-op without audit feature).
    #[cfg(not(feature = "audit"))]
    pub fn set_attribute(&mut self, _key: &str, _value: impl Into<String>) {
        // No-op
    }

    /// Record an error in the operation by writing it to the error log.
    #[cfg(feature = "audit")]
    pub fn record_error(&mut self, error: &dyn std::error::Error) {
        error!("Operation {} failed: {}", self.operation_name, error);
    }

    /// Record an error (no-op without audit feature).
    #[cfg(not(feature = "audit"))]
    pub fn record_error(&mut self, _error: &dyn std::error::Error) {
        // No-op
    }

    /// Get the duration of the operation so far (time elapsed since creation).
    pub fn duration(&self) -> Duration {
        self.start_time.elapsed()
    }

    /// Finish the operation and return the duration.
    ///
    /// Consumes the operation; with the `audit` feature the completion time is
    /// also written to the debug log.
    pub fn finish(self) -> Duration {
        let duration = self.duration();

        #[cfg(feature = "audit")]
        debug!(
            "Operation {} completed in {}ms",
            self.operation_name,
            duration.as_millis()
        );

        duration
    }
}

impl Default for InstrumentedOperation {
    /// Creates an operation named `"unnamed_operation"`.
    fn default() -> Self {
        Self::new("unnamed_operation")
    }
}
351
/// Macro for creating instrumented operations.
///
/// * `instrument!(telemetry, name)` expands to a new `InstrumentedOperation`
///   called `name`.
/// * `instrument!(telemetry, name, (key, value), ...)` additionally calls
///   `set_attribute(key, value)` for each attribute, in order.
///
/// The `telemetry` argument is accepted for call-site symmetry but is not part
/// of the expansion, so it is never evaluated. Each attribute expression is
/// evaluated exactly once, which means attributes may carry owned values
/// (e.g. a `String`) and may have side effects. A trailing comma is allowed.
#[macro_export]
macro_rules! instrument {
    ($telemetry:expr, $operation:expr $(,)?) => {{
        $crate::telemetry::InstrumentedOperation::new($operation)
    }};
    ($telemetry:expr, $operation:expr, $($attr:expr),+ $(,)?) => {{
        let mut op = $crate::telemetry::InstrumentedOperation::new($operation);
        $(
            // Bind first: expanding `$attr` twice would evaluate it twice and
            // fail to compile when it moves a non-`Copy` value.
            let attr = $attr;
            op.set_attribute(attr.0, attr.1);
        )+
        op
    }};
}
366
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration identifies the crate and enables every switch.
    #[test]
    fn test_telemetry_config_default() {
        let defaults = TelemetryConfig::default();

        assert_eq!(defaults.service_name, "role-system");
        assert_eq!(defaults.service_version, env!("CARGO_PKG_VERSION"));

        assert!(defaults.detailed_tracking);
        assert!(defaults.enable_metrics);
        assert!(defaults.enable_error_tracking);
        assert!(defaults.enable_performance_tracking);
    }

    /// `finish` reports at least the time spent between creation and the call.
    #[test]
    fn test_instrumented_operation() {
        let pause = Duration::from_millis(1);
        let timer = InstrumentedOperation::new("test_operation");

        // Stand-in for real work.
        std::thread::sleep(pause);

        let elapsed = timer.finish();
        assert!(elapsed >= pause);
    }

    /// Permission checks and role operations show up in the snapshot.
    #[test]
    fn test_telemetry_provider_creation() {
        let provider = TelemetryProvider::with_config(TelemetryConfig::default());

        // One granted check, one denied check, one successful role operation.
        provider.record_permission_check("alice", "read", "document", true);
        provider.record_permission_check("bob", "write", "document", false);
        provider.record_role_operation("assign", "admin", true);

        let snapshot = provider.get_metrics();
        assert_eq!(snapshot.permission_checks_total, 2);
        assert_eq!(snapshot.permission_checks_granted, 1);
        assert_eq!(snapshot.permission_checks_denied, 1);
        assert_eq!(snapshot.role_operations_total, 1);
    }

    /// Cache hits/misses are counted separately and cleared by `reset_metrics`.
    #[test]
    fn test_telemetry_metrics() {
        let provider = TelemetryProvider::new();

        // hit, miss, hit
        for hit in [true, false, true].iter().copied() {
            provider.record_cache_operation(hit);
        }

        let before_reset = provider.get_metrics();
        assert_eq!(before_reset.cache_hits, 2);
        assert_eq!(before_reset.cache_misses, 1);

        provider.reset_metrics();

        let after_reset = provider.get_metrics();
        assert_eq!(after_reset.cache_hits, 0);
        assert_eq!(after_reset.cache_misses, 0);
    }

    /// A single recorded duration is both the total and the average.
    #[test]
    fn test_operation_duration_tracking() {
        let provider = TelemetryProvider::new();

        provider.record_operation_duration("test_op", Duration::from_millis(100));

        let snapshot = provider.get_metrics();
        assert_eq!(snapshot.total_operation_time_ms, 100);
        assert_eq!(snapshot.avg_permission_check_time_ms, 100.0);
    }
}