auth_framework/monitoring/
mod.rs

1//! Monitoring and Metrics Collection Module
2//!
3//! This module provides comprehensive monitoring capabilities for the authentication framework,
4//! including metrics collection, performance monitoring, security event tracking, and
5//! integration with external monitoring systems.
6//!
7//! # Features
8//!
9//! - **Performance Metrics**: Track authentication performance, latency, and throughput
10//! - **Security Monitoring**: Monitor security events, failed attempts, and anomalies
11//! - **Health Checks**: Provide health status for all authentication components
12//! - **Custom Metrics**: Support for application-specific metrics
13//! - **Integration**: Export metrics to Prometheus, Grafana, DataDog, etc.
14//! - **Alerting**: Configuration-based alerting for critical events
15
16use crate::errors::Result;
17use serde::{Deserialize, Serialize};
18use std::collections::HashMap;
19use std::sync::Arc;
20use std::sync::atomic::{AtomicU64, Ordering};
21use std::time::{Duration, SystemTime, UNIX_EPOCH};
22use tokio::sync::RwLock;
23use tracing::warn;
24
25pub mod alerts;
26pub mod collectors;
27pub mod exporters;
28pub mod health;
29
/// Monitoring configuration
///
/// Settings consumed by `MonitoringManager`. The struct derives
/// `Serialize`/`Deserialize`, and `Default` yields a fully enabled setup.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonitoringConfig {
    /// Enable monitoring system.
    ///
    /// When `false`, `MonitoringManager::record_metric` discards every data
    /// point. The atomic counters, security events and health checks are
    /// governed by their own flags below, not by this one.
    pub enabled: bool,
    /// Collection interval in seconds.
    ///
    /// NOTE(review): not read anywhere in the visible part of this module;
    /// presumably consumed by the collector/exporter submodules — confirm.
    pub collection_interval: u64,
    /// Maximum metrics history size.
    ///
    /// Used as the cap for both the metric history and the security event
    /// list; once exceeded, the oldest entry is dropped.
    pub max_history_size: usize,
    /// Enable performance monitoring (gates the metric data points emitted by
    /// the `record_*` methods; the atomic counters are updated regardless).
    pub enable_performance_metrics: bool,
    /// Enable security monitoring (gates `record_security_event`).
    pub enable_security_metrics: bool,
    /// Enable health checks (when `false`, `health_check` returns an empty map).
    pub enable_health_checks: bool,
    /// External monitoring endpoints.
    ///
    /// NOTE(review): not read anywhere in the visible part of this module —
    /// format and consumer to be confirmed.
    pub external_endpoints: Vec<String>,
}
48
49impl Default for MonitoringConfig {
50    fn default() -> Self {
51        Self {
52            enabled: true,
53            collection_interval: 60, // 1 minute
54            max_history_size: 1000,
55            enable_performance_metrics: true,
56            enable_security_metrics: true,
57            enable_health_checks: true,
58            external_endpoints: vec![],
59        }
60    }
61}
62
/// Metric data point
///
/// One sample stored in the manager's metric history.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricDataPoint {
    /// Metric name (e.g. `"auth_attempts_total"`).
    pub name: String,
    /// Metric value.
    ///
    /// Within this module most counters record `1.0` per occurrence, while
    /// `auth_requests_total` records the running total and `active_sessions`
    /// records the current count.
    pub value: f64,
    /// Timestamp; points created by this module use seconds since the Unix epoch.
    pub timestamp: u64,
    /// Labels/tags attached to the sample (key -> value).
    pub labels: HashMap<String, String>,
    /// Metric type
    pub metric_type: MetricType,
}
77
78/// Types of metrics
79#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
80pub enum MetricType {
81    /// Counter - monotonically increasing
82    Counter,
83    /// Gauge - can go up and down
84    Gauge,
85    /// Histogram - distribution of values
86    Histogram,
87    /// Summary - like histogram with quantiles
88    Summary,
89}
90
/// Security event for monitoring
///
/// Stored in the manager's security event list and logged through `tracing`
/// when recorded.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SecurityEvent {
    /// Event type
    pub event_type: SecurityEventType,
    /// User ID (if applicable)
    pub user_id: Option<String>,
    /// IP address; events generated by `record_auth_failure` leave this `None`.
    pub ip_address: Option<String>,
    /// Event details as free-form key/value pairs (e.g. `"reason"`).
    pub details: HashMap<String, String>,
    /// Severity level; `Critical` events are additionally logged at error level.
    pub severity: SecurityEventSeverity,
    /// Timestamp; events created by this module use seconds since the Unix epoch.
    pub timestamp: u64,
}
107
108/// Security event types
109#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
110pub enum SecurityEventType {
111    /// Failed login attempt
112    FailedLogin,
113    /// Account lockout
114    AccountLockout,
115    /// Privilege escalation
116    PrivilegeEscalation,
117    /// Unusual activity pattern
118    UnusualActivity,
119    /// Token manipulation
120    TokenManipulation,
121    /// Configuration change
122    ConfigurationChange,
123    /// System error
124    SystemError,
125}
126
127/// Security event severity levels
128#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)]
129pub enum SecurityEventSeverity {
130    /// Low severity
131    Low = 1,
132    /// Medium severity
133    Medium = 2,
134    /// High severity
135    High = 3,
136    /// Critical severity
137    Critical = 4,
138}
139
/// Performance metrics
///
/// A set of atomic counters/gauges. Every field is an `Arc`, so cloning this
/// struct yields handles to the same underlying values rather than a snapshot.
#[derive(Debug, Clone)]
pub struct PerformanceMetrics {
    /// Authentication request count
    pub auth_requests: Arc<AtomicU64>,
    /// Successful authentications
    pub auth_successes: Arc<AtomicU64>,
    /// Failed authentications
    pub auth_failures: Arc<AtomicU64>,
    /// Token creation count
    pub token_creations: Arc<AtomicU64>,
    /// Token validation count
    pub token_validations: Arc<AtomicU64>,
    /// Session count (a gauge: overwritten with the latest value, not incremented)
    pub active_sessions: Arc<AtomicU64>,
    /// MFA challenges
    pub mfa_challenges: Arc<AtomicU64>,
    /// Average response time (microseconds).
    ///
    /// Within this module it is only updated from successful authentications,
    /// and `0` doubles as the "no sample yet" marker.
    pub avg_response_time: Arc<AtomicU64>,
}
160
161impl Default for PerformanceMetrics {
162    fn default() -> Self {
163        Self {
164            auth_requests: Arc::new(AtomicU64::new(0)),
165            auth_successes: Arc::new(AtomicU64::new(0)),
166            auth_failures: Arc::new(AtomicU64::new(0)),
167            token_creations: Arc::new(AtomicU64::new(0)),
168            token_validations: Arc::new(AtomicU64::new(0)),
169            active_sessions: Arc::new(AtomicU64::new(0)),
170            mfa_challenges: Arc::new(AtomicU64::new(0)),
171            avg_response_time: Arc::new(AtomicU64::new(0)),
172        }
173    }
174}
175
176/// Health check status
177#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
178pub enum HealthStatus {
179    /// All systems operational
180    Healthy,
181    /// Minor issues, still functional
182    Degraded,
183    /// Major issues, limited functionality
184    Unhealthy,
185    /// System down
186    Critical,
187}
188
/// Health check result
///
/// Outcome of probing a single component; `MonitoringManager::health_check`
/// returns one of these per component and caches the latest per component name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckResult {
    /// Component name (this module uses `"authentication"`, `"storage"` and `"tokens"`).
    pub component: String,
    /// Health status
    pub status: HealthStatus,
    /// Human-readable status message
    pub message: String,
    /// Last check timestamp; results produced by this module use seconds since the Unix epoch.
    pub timestamp: u64,
    /// Response time in milliseconds (how long the check itself took).
    pub response_time: u64,
}
203
/// Main monitoring manager
///
/// Owns the atomic performance counters plus three lock-protected stores:
/// the metric history, the security event list and the cache of the most
/// recent health check result per component.
pub struct MonitoringManager {
    /// Configuration (fixed at construction; the feature flags are read on every call).
    config: MonitoringConfig,
    /// Performance metrics (lock-free atomic counters/gauges).
    performance: PerformanceMetrics,
    /// Metric history, oldest first, capped at `config.max_history_size`.
    metrics_history: Arc<RwLock<Vec<MetricDataPoint>>>,
    /// Security events, oldest first, capped at `config.max_history_size`.
    security_events: Arc<RwLock<Vec<SecurityEvent>>>,
    /// Health check results, keyed by component name.
    health_results: Arc<RwLock<HashMap<String, HealthCheckResult>>>,
}
217
218impl MonitoringManager {
219    /// Create new monitoring manager
220    pub fn new(config: MonitoringConfig) -> Self {
221        Self {
222            config,
223            performance: PerformanceMetrics::default(),
224            metrics_history: Arc::new(RwLock::new(Vec::new())),
225            security_events: Arc::new(RwLock::new(Vec::new())),
226            health_results: Arc::new(RwLock::new(HashMap::new())),
227        }
228    }
229
230    /// Record authentication request
231    pub async fn record_auth_request(&self) {
232        self.performance
233            .auth_requests
234            .fetch_add(1, Ordering::Relaxed);
235
236        if self.config.enable_performance_metrics {
237            self.record_metric(MetricDataPoint {
238                name: "auth_requests_total".to_string(),
239                value: self.performance.auth_requests.load(Ordering::Relaxed) as f64,
240                timestamp: current_timestamp(),
241                labels: HashMap::new(),
242                metric_type: MetricType::Counter,
243            })
244            .await;
245        }
246    }
247
248    /// Record successful authentication
249    pub async fn record_auth_success(&self, user_id: &str, duration: Duration) {
250        self.performance
251            .auth_successes
252            .fetch_add(1, Ordering::Relaxed);
253        self.update_avg_response_time(duration).await;
254
255        if self.config.enable_performance_metrics {
256            let mut labels = HashMap::new();
257            labels.insert("result".to_string(), "success".to_string());
258            labels.insert("user_id".to_string(), user_id.to_string());
259
260            self.record_metric(MetricDataPoint {
261                name: "auth_attempts_total".to_string(),
262                value: 1.0,
263                timestamp: current_timestamp(),
264                labels,
265                metric_type: MetricType::Counter,
266            })
267            .await;
268        }
269    }
270
271    /// Record failed authentication
272    pub async fn record_auth_failure(&self, user_id: Option<&str>, reason: &str) {
273        self.performance
274            .auth_failures
275            .fetch_add(1, Ordering::Relaxed);
276
277        if self.config.enable_security_metrics {
278            let mut details = HashMap::new();
279            details.insert("reason".to_string(), reason.to_string());
280            if let Some(user) = user_id {
281                details.insert("user_id".to_string(), user.to_string());
282            }
283
284            let security_event = SecurityEvent {
285                event_type: SecurityEventType::FailedLogin,
286                user_id: user_id.map(|s| s.to_string()),
287                ip_address: None, // Would be populated from request context
288                details,
289                severity: SecurityEventSeverity::Medium,
290                timestamp: current_timestamp(),
291            };
292
293            self.record_security_event(security_event).await;
294        }
295
296        if self.config.enable_performance_metrics {
297            let mut labels = HashMap::new();
298            labels.insert("result".to_string(), "failure".to_string());
299            labels.insert("reason".to_string(), reason.to_string());
300
301            self.record_metric(MetricDataPoint {
302                name: "auth_attempts_total".to_string(),
303                value: 1.0,
304                timestamp: current_timestamp(),
305                labels,
306                metric_type: MetricType::Counter,
307            })
308            .await;
309        }
310    }
311
312    /// Record token creation
313    pub async fn record_token_creation(&self, token_type: &str) {
314        self.performance
315            .token_creations
316            .fetch_add(1, Ordering::Relaxed);
317
318        if self.config.enable_performance_metrics {
319            let mut labels = HashMap::new();
320            labels.insert("token_type".to_string(), token_type.to_string());
321
322            self.record_metric(MetricDataPoint {
323                name: "tokens_created_total".to_string(),
324                value: 1.0,
325                timestamp: current_timestamp(),
326                labels,
327                metric_type: MetricType::Counter,
328            })
329            .await;
330        }
331    }
332
333    /// Record token validation
334    pub async fn record_token_validation(&self, valid: bool) {
335        self.performance
336            .token_validations
337            .fetch_add(1, Ordering::Relaxed);
338
339        if self.config.enable_performance_metrics {
340            let mut labels = HashMap::new();
341            labels.insert(
342                "result".to_string(),
343                if valid { "valid" } else { "invalid" }.to_string(),
344            );
345
346            self.record_metric(MetricDataPoint {
347                name: "tokens_validated_total".to_string(),
348                value: 1.0,
349                timestamp: current_timestamp(),
350                labels,
351                metric_type: MetricType::Counter,
352            })
353            .await;
354        }
355    }
356
357    /// Update session count
358    pub async fn update_session_count(&self, count: u64) {
359        self.performance
360            .active_sessions
361            .store(count, Ordering::Relaxed);
362
363        if self.config.enable_performance_metrics {
364            self.record_metric(MetricDataPoint {
365                name: "active_sessions".to_string(),
366                value: count as f64,
367                timestamp: current_timestamp(),
368                labels: HashMap::new(),
369                metric_type: MetricType::Gauge,
370            })
371            .await;
372        }
373    }
374
375    /// Record MFA challenge
376    pub async fn record_mfa_challenge(&self, method: &str) {
377        self.performance
378            .mfa_challenges
379            .fetch_add(1, Ordering::Relaxed);
380
381        if self.config.enable_performance_metrics {
382            let mut labels = HashMap::new();
383            labels.insert("method".to_string(), method.to_string());
384
385            self.record_metric(MetricDataPoint {
386                name: "mfa_challenges_total".to_string(),
387                value: 1.0,
388                timestamp: current_timestamp(),
389                labels,
390                metric_type: MetricType::Counter,
391            })
392            .await;
393        }
394    }
395
396    /// Record security event
397    pub async fn record_security_event(&self, event: SecurityEvent) {
398        if !self.config.enable_security_metrics {
399            return;
400        }
401
402        let mut events = self.security_events.write().await;
403        events.push(event.clone());
404
405        // Keep only recent events
406        if events.len() > self.config.max_history_size {
407            events.remove(0);
408        }
409
410        tracing::warn!(
411            "Security event: {:?} - User: {:?}, Severity: {:?}",
412            event.event_type,
413            event.user_id,
414            event.severity
415        );
416
417        // Alert on critical events
418        if event.severity == SecurityEventSeverity::Critical {
419            // Would trigger external alerting system
420            tracing::error!("CRITICAL security event: {:?}", event);
421        }
422    }
423
424    /// Record generic metric
425    async fn record_metric(&self, metric: MetricDataPoint) {
426        if !self.config.enabled {
427            return;
428        }
429
430        let mut metrics = self.metrics_history.write().await;
431        metrics.push(metric);
432
433        // Keep history size manageable
434        if metrics.len() > self.config.max_history_size {
435            metrics.remove(0);
436        }
437    }
438
439    /// Update average response time
440    async fn update_avg_response_time(&self, duration: Duration) {
441        let current_avg = self.performance.avg_response_time.load(Ordering::Relaxed);
442        let new_time = duration.as_micros() as u64;
443
444        // Simple moving average
445        let updated_avg = if current_avg == 0 {
446            new_time
447        } else {
448            (current_avg + new_time) / 2
449        };
450
451        self.performance
452            .avg_response_time
453            .store(updated_avg, Ordering::Relaxed);
454    }
455
456    /// Get current performance metrics
457    pub fn get_performance_metrics(&self) -> HashMap<String, u64> {
458        let mut metrics = HashMap::new();
459        metrics.insert(
460            "auth_requests".to_string(),
461            self.performance.auth_requests.load(Ordering::Relaxed),
462        );
463        metrics.insert(
464            "auth_successes".to_string(),
465            self.performance.auth_successes.load(Ordering::Relaxed),
466        );
467        metrics.insert(
468            "auth_failures".to_string(),
469            self.performance.auth_failures.load(Ordering::Relaxed),
470        );
471        metrics.insert(
472            "token_creations".to_string(),
473            self.performance.token_creations.load(Ordering::Relaxed),
474        );
475        metrics.insert(
476            "token_validations".to_string(),
477            self.performance.token_validations.load(Ordering::Relaxed),
478        );
479        metrics.insert(
480            "active_sessions".to_string(),
481            self.performance.active_sessions.load(Ordering::Relaxed),
482        );
483        metrics.insert(
484            "mfa_challenges".to_string(),
485            self.performance.mfa_challenges.load(Ordering::Relaxed),
486        );
487        metrics.insert(
488            "avg_response_time_us".to_string(),
489            self.performance.avg_response_time.load(Ordering::Relaxed),
490        );
491        metrics
492    }
493
494    /// Get security events
495    pub async fn get_security_events(&self, limit: Option<usize>) -> Vec<SecurityEvent> {
496        let events = self.security_events.read().await;
497        let limit = limit.unwrap_or(100);
498
499        if events.len() <= limit {
500            events.clone()
501        } else {
502            events.iter().rev().take(limit).cloned().collect()
503        }
504    }
505
506    /// Get metrics history
507    pub async fn get_metrics_history(&self, metric_name: Option<&str>) -> Vec<MetricDataPoint> {
508        let metrics = self.metrics_history.read().await;
509
510        if let Some(name) = metric_name {
511            metrics.iter().filter(|m| m.name == name).cloned().collect()
512        } else {
513            metrics.clone()
514        }
515    }
516
517    /// Perform health check
518    pub async fn health_check(&self) -> Result<HashMap<String, HealthCheckResult>> {
519        if !self.config.enable_health_checks {
520            return Ok(HashMap::new());
521        }
522
523        let mut results = HashMap::new();
524        let start_time = SystemTime::now();
525
526        // Check authentication system health
527        let auth_health = self.check_auth_health().await;
528        results.insert("authentication".to_string(), auth_health);
529
530        // Check storage health
531        let storage_health = self.check_storage_health().await;
532        results.insert("storage".to_string(), storage_health);
533
534        // Check token system health
535        let token_health = self.check_token_health().await;
536        results.insert("tokens".to_string(), token_health);
537
538        // Update health results cache
539        let mut health_cache = self.health_results.write().await;
540        for (component, result) in &results {
541            health_cache.insert(component.clone(), result.clone());
542        }
543
544        let elapsed = start_time.elapsed().unwrap_or_default();
545        tracing::debug!("Health check completed in {:?}", elapsed);
546
547        Ok(results)
548    }
549
550    /// Check authentication system health
551    async fn check_auth_health(&self) -> HealthCheckResult {
552        let start_time = SystemTime::now();
553
554        // Check basic metrics
555        let auth_requests = self.performance.auth_requests.load(Ordering::Relaxed);
556        let auth_failures = self.performance.auth_failures.load(Ordering::Relaxed);
557
558        let status = if auth_requests > 0 {
559            let failure_rate = (auth_failures as f64) / (auth_requests as f64);
560            if failure_rate > 0.5 {
561                HealthStatus::Unhealthy
562            } else if failure_rate > 0.2 {
563                HealthStatus::Degraded
564            } else {
565                HealthStatus::Healthy
566            }
567        } else {
568            HealthStatus::Healthy
569        };
570
571        let message = match status {
572            HealthStatus::Healthy => "Authentication system operating normally".to_string(),
573            HealthStatus::Degraded => format!(
574                "High failure rate: {:.1}%",
575                (auth_failures as f64 / auth_requests as f64) * 100.0
576            ),
577            HealthStatus::Unhealthy => format!(
578                "Critical failure rate: {:.1}%",
579                (auth_failures as f64 / auth_requests as f64) * 100.0
580            ),
581            HealthStatus::Critical => "Authentication system down".to_string(),
582        };
583
584        HealthCheckResult {
585            component: "authentication".to_string(),
586            status,
587            message,
588            timestamp: current_timestamp(),
589            response_time: start_time.elapsed().unwrap_or_default().as_millis() as u64,
590        }
591    }
592
593    /// Check storage system health
594    async fn check_storage_health(&self) -> HealthCheckResult {
595        let start_time = SystemTime::now();
596
597        // Test storage connectivity with a simple operation
598        let status = match self.test_storage_connectivity().await {
599            Ok(response_time_ms) => {
600                if response_time_ms > 5000 {
601                    HealthStatus::Degraded
602                } else {
603                    HealthStatus::Healthy
604                }
605            }
606            Err(e) => {
607                warn!("Storage health check failed: {}", e);
608                HealthStatus::Critical
609            }
610        };
611
612        let message = match status {
613            HealthStatus::Healthy => "Storage system operational".to_string(),
614            HealthStatus::Degraded => "Storage system slow but operational".to_string(),
615            HealthStatus::Critical => "Storage system connectivity failed".to_string(),
616            HealthStatus::Unhealthy => "Storage system unhealthy".to_string(),
617        };
618
619        HealthCheckResult {
620            component: "storage".to_string(),
621            status,
622            message,
623            timestamp: current_timestamp(),
624            response_time: start_time.elapsed().unwrap_or_default().as_millis() as u64,
625        }
626    }
627
628    /// Test storage connectivity with a lightweight operation
629    async fn test_storage_connectivity(
630        &self,
631    ) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
632        let start_time = SystemTime::now();
633
634        // Attempt a simple storage operation to test connectivity
635        // This is a lightweight test that doesn't affect application data
636        match tokio::time::timeout(
637            std::time::Duration::from_secs(5),
638            self.attempt_storage_ping(),
639        )
640        .await
641        {
642            Ok(result) => {
643                result?;
644                let response_time = start_time.elapsed()?.as_millis() as u64;
645                Ok(response_time)
646            }
647            Err(_) => Err("Storage connectivity test timed out".into()),
648        }
649    }
650
651    /// Attempt to ping the storage system
652    async fn attempt_storage_ping(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
653        // Try to perform a simple read operation that should always work
654        // This tests database/storage connectivity without side effects
655
656        // For now, we'll simulate a basic connectivity test
657        // In a full implementation, this would:
658        // 1. Test database connection pool
659        // 2. Execute a simple SELECT 1 query
660        // 3. Test Redis connectivity if Redis is configured
661        // 4. Verify storage backend is responsive
662
663        // Placeholder: simulate a storage ping
664        tokio::time::sleep(std::time::Duration::from_millis(10)).await;
665
666        // PRODUCTION: Storage-specific ping tests available:
667        // - PostgreSQL: SELECT 1
668        // - Redis: PING command
669        // - MySQL: SELECT 1
670        // - Memory: verify internal structures
671
672        Ok(())
673    }
674
675    /// Check token system health
676    async fn check_token_health(&self) -> HealthCheckResult {
677        let start_time = SystemTime::now();
678
679        let token_validations = self.performance.token_validations.load(Ordering::Relaxed);
680
681        HealthCheckResult {
682            component: "tokens".to_string(),
683            status: HealthStatus::Healthy,
684            message: format!(
685                "Token system operational - {} validations",
686                token_validations
687            ),
688            timestamp: current_timestamp(),
689            response_time: start_time.elapsed().unwrap_or_default().as_millis() as u64,
690        }
691    }
692
693    /// Export metrics in Prometheus format
694    pub async fn export_prometheus_metrics(&self) -> String {
695        let mut output = String::new();
696
697        let metrics = self.get_performance_metrics();
698
699        for (name, value) in metrics {
700            output.push_str(&format!(
701                "# HELP auth_{} Authentication framework metric\n",
702                name
703            ));
704            output.push_str(&format!("# TYPE auth_{} counter\n", name));
705            output.push_str(&format!("auth_{} {}\n", name, value));
706        }
707
708        output
709    }
710}
711
/// Get current Unix timestamp
///
/// Returns whole seconds elapsed since the Unix epoch, or `0` if the system
/// clock reports a time before the epoch.
fn current_timestamp() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
719
#[cfg(test)]
mod tests {
    // `#[tokio::test]` resolves `tokio` through the extern prelude, so no
    // separate `use tokio;` is needed.
    use super::*;

    /// A freshly created manager starts with zeroed counters.
    #[tokio::test]
    async fn test_monitoring_manager_creation() {
        let config = MonitoringConfig::default();
        let manager = MonitoringManager::new(config);

        let metrics = manager.get_performance_metrics();
        assert_eq!(metrics["auth_requests"], 0);
    }

    /// Each recorded request increments the request counter by one.
    #[tokio::test]
    async fn test_auth_request_recording() {
        let config = MonitoringConfig::default();
        let manager = MonitoringManager::new(config);

        manager.record_auth_request().await;
        manager.record_auth_request().await;

        let metrics = manager.get_performance_metrics();
        assert_eq!(metrics["auth_requests"], 2);
    }

    /// A recorded security event can be read back unchanged.
    #[tokio::test]
    async fn test_security_event_recording() {
        let config = MonitoringConfig::default();
        let manager = MonitoringManager::new(config);

        let event = SecurityEvent {
            event_type: SecurityEventType::FailedLogin,
            user_id: Some("test_user".to_string()),
            ip_address: Some("127.0.0.1".to_string()),
            details: HashMap::new(),
            severity: SecurityEventSeverity::Medium,
            timestamp: current_timestamp(),
        };

        manager.record_security_event(event).await;

        let events = manager.get_security_events(None).await;
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].event_type, SecurityEventType::FailedLogin);
    }

    /// The health check reports on all three components.
    #[tokio::test]
    async fn test_health_check() {
        let config = MonitoringConfig::default();
        let manager = MonitoringManager::new(config);

        let health_results = manager.health_check().await.unwrap();

        assert!(health_results.contains_key("authentication"));
        assert!(health_results.contains_key("storage"));
        assert!(health_results.contains_key("tokens"));
    }

    /// The Prometheus export contains the prefixed metric plus HELP/TYPE lines.
    #[tokio::test]
    async fn test_prometheus_export() {
        let config = MonitoringConfig::default();
        let manager = MonitoringManager::new(config);

        manager.record_auth_request().await;

        let prometheus_output = manager.export_prometheus_metrics().await;

        assert!(prometheus_output.contains("auth_auth_requests"));
        assert!(prometheus_output.contains("# HELP"));
        assert!(prometheus_output.contains("# TYPE"));
    }
}