1use crate::errors::Result;
17use serde::{Deserialize, Serialize};
18use std::collections::HashMap;
19use std::sync::Arc;
20use std::sync::atomic::{AtomicU64, Ordering};
21use std::time::{Duration, SystemTime, UNIX_EPOCH};
22use tokio::sync::RwLock;
23use tracing::warn;
24
25pub mod alerts;
26pub mod collectors;
27pub mod exporters;
28pub mod health;
29
30pub use collectors::{
32 AUTH_FAILED_REQUESTS, AUTH_SUCCESSFUL_REQUESTS, AUTH_TOTAL_REQUESTS, SESSION_ACTIVE_COUNT,
33 SESSION_CREATED_TOTAL, SESSION_EXPIRED_COUNT, TOKEN_CREATION_COUNT, TOKEN_EXPIRATION_COUNT,
34 TOKEN_VALIDATION_COUNT,
35};
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct MonitoringConfig {
40 pub enabled: bool,
42 pub collection_interval: u64,
44 pub max_history_size: usize,
46 pub enable_performance_metrics: bool,
48 pub enable_security_metrics: bool,
50 pub enable_health_checks: bool,
52 pub external_endpoints: Vec<String>,
54}
55
56impl Default for MonitoringConfig {
57 fn default() -> Self {
58 Self {
59 enabled: true,
60 collection_interval: 60, max_history_size: 1000,
62 enable_performance_metrics: true,
63 enable_security_metrics: true,
64 enable_health_checks: true,
65 external_endpoints: vec![],
66 }
67 }
68}
69
70#[derive(Debug, Clone, Serialize, Deserialize)]
72pub struct MetricDataPoint {
73 pub name: String,
75 pub value: f64,
77 pub timestamp: u64,
79 pub labels: HashMap<String, String>,
81 pub metric_type: MetricType,
83}
84
85#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
87pub enum MetricType {
88 Counter,
90 Gauge,
92 Histogram,
94 Summary,
96}
97
98#[derive(Debug, Clone, Serialize, Deserialize)]
100pub struct SecurityEvent {
101 pub event_type: SecurityEventType,
103 pub user_id: Option<String>,
105 pub ip_address: Option<String>,
107 pub details: HashMap<String, String>,
109 pub severity: SecurityEventSeverity,
111 pub timestamp: u64,
113}
114
115impl SecurityEvent {
116 pub fn builder(
130 event_type: SecurityEventType,
131 severity: SecurityEventSeverity,
132 ) -> SecurityEventBuilder {
133 SecurityEventBuilder {
134 event_type,
135 severity,
136 user_id: None,
137 ip_address: None,
138 details: HashMap::new(),
139 }
140 }
141}
142
143pub struct SecurityEventBuilder {
147 event_type: SecurityEventType,
148 severity: SecurityEventSeverity,
149 user_id: Option<String>,
150 ip_address: Option<String>,
151 details: HashMap<String, String>,
152}
153
154impl SecurityEventBuilder {
155 pub fn user(mut self, user_id: impl Into<String>) -> Self {
157 self.user_id = Some(user_id.into());
158 self
159 }
160
161 pub fn ip(mut self, ip_address: impl Into<String>) -> Self {
163 self.ip_address = Some(ip_address.into());
164 self
165 }
166
167 pub fn detail(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
169 self.details.insert(key.into(), value.into());
170 self
171 }
172
173 pub fn details(
175 mut self,
176 iter: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
177 ) -> Self {
178 for (k, v) in iter {
179 self.details.insert(k.into(), v.into());
180 }
181 self
182 }
183
184 pub fn build(self) -> SecurityEvent {
186 SecurityEvent {
187 event_type: self.event_type,
188 user_id: self.user_id,
189 ip_address: self.ip_address,
190 details: self.details,
191 severity: self.severity,
192 timestamp: SystemTime::now()
193 .duration_since(UNIX_EPOCH)
194 .unwrap_or_default()
195 .as_secs(),
196 }
197 }
198}
199
200#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
202pub enum SecurityEventType {
203 FailedLogin,
205 AccountLockout,
207 PrivilegeEscalation,
209 UnusualActivity,
211 TokenManipulation,
213 ConfigurationChange,
215 SystemError,
217}
218
219#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)]
221pub enum SecurityEventSeverity {
222 Low = 1,
224 Medium = 2,
226 High = 3,
228 Critical = 4,
230}
231
/// Atomic counters describing authentication activity.
///
/// Every field is an `Arc<AtomicU64>`, so cloning a `PerformanceMetrics` produces a
/// handle to the *same* counters rather than an independent snapshot.
///
/// `Default` is derived: each field defaults to a fresh `Arc` around an `AtomicU64`
/// holding `0`, which is exactly what the previous hand-written impl produced.
#[derive(Debug, Clone, Default)]
pub struct PerformanceMetrics {
    /// Total authentication requests seen.
    pub auth_requests: Arc<AtomicU64>,
    /// Total successful authentications.
    pub auth_successes: Arc<AtomicU64>,
    /// Total failed authentications.
    pub auth_failures: Arc<AtomicU64>,
    /// Total tokens created.
    pub token_creations: Arc<AtomicU64>,
    /// Total token validations performed.
    pub token_validations: Arc<AtomicU64>,
    /// Current number of active sessions (overwritten, not accumulated).
    pub active_sessions: Arc<AtomicU64>,
    /// Total MFA challenges issued.
    pub mfa_challenges: Arc<AtomicU64>,
    /// Smoothed authentication response time in microseconds (`0` until the first sample).
    pub avg_response_time: Arc<AtomicU64>,
}
267
268#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
270pub enum HealthStatus {
271 Healthy,
273 Degraded,
275 Unhealthy,
277 Critical,
279}
280
281#[derive(Debug, Clone, Serialize, Deserialize)]
283pub struct HealthCheckResult {
284 pub component: String,
286 pub status: HealthStatus,
288 pub message: String,
290 pub timestamp: u64,
292 pub response_time: u64,
294}
295
296pub struct MonitoringManager {
298 config: MonitoringConfig,
300 performance: PerformanceMetrics,
302 metrics_history: Arc<RwLock<Vec<MetricDataPoint>>>,
304 security_events: Arc<RwLock<Vec<SecurityEvent>>>,
306 health_results: Arc<RwLock<HashMap<String, HealthCheckResult>>>,
308}
309
310impl MonitoringManager {
311 pub fn new(config: MonitoringConfig) -> Self {
313 Self {
314 config,
315 performance: PerformanceMetrics::default(),
316 metrics_history: Arc::new(RwLock::new(Vec::new())),
317 security_events: Arc::new(RwLock::new(Vec::new())),
318 health_results: Arc::new(RwLock::new(HashMap::new())),
319 }
320 }
321
322 pub async fn record_auth_request(&self) {
324 self.performance
325 .auth_requests
326 .fetch_add(1, Ordering::Relaxed);
327
328 if self.config.enable_performance_metrics {
329 self.record_metric(MetricDataPoint {
330 name: "auth_requests_total".to_string(),
331 value: self.performance.auth_requests.load(Ordering::Relaxed) as f64,
332 timestamp: current_timestamp(),
333 labels: HashMap::new(),
334 metric_type: MetricType::Counter,
335 })
336 .await;
337 }
338 }
339
340 pub async fn record_auth_success(&self, user_id: &str, duration: Duration) {
342 self.performance
343 .auth_successes
344 .fetch_add(1, Ordering::Relaxed);
345 self.update_avg_response_time(duration).await;
346
347 if self.config.enable_performance_metrics {
348 let mut labels = HashMap::new();
349 labels.insert("result".to_string(), "success".to_string());
350 labels.insert("user_id".to_string(), user_id.to_string());
351
352 self.record_metric(MetricDataPoint {
353 name: "auth_attempts_total".to_string(),
354 value: 1.0,
355 timestamp: current_timestamp(),
356 labels,
357 metric_type: MetricType::Counter,
358 })
359 .await;
360 }
361 }
362
363 pub async fn record_auth_failure(&self, user_id: Option<&str>, reason: &str) {
365 self.performance
366 .auth_failures
367 .fetch_add(1, Ordering::Relaxed);
368
369 if self.config.enable_security_metrics {
370 let mut details = HashMap::new();
371 details.insert("reason".to_string(), reason.to_string());
372 if let Some(user) = user_id {
373 details.insert("user_id".to_string(), user.to_string());
374 }
375
376 let security_event = SecurityEvent {
377 event_type: SecurityEventType::FailedLogin,
378 user_id: user_id.map(|s| s.to_string()),
379 ip_address: None, details,
381 severity: SecurityEventSeverity::Medium,
382 timestamp: current_timestamp(),
383 };
384
385 self.record_security_event(security_event).await;
386 }
387
388 if self.config.enable_performance_metrics {
389 let mut labels = HashMap::new();
390 labels.insert("result".to_string(), "failure".to_string());
391 labels.insert("reason".to_string(), reason.to_string());
392
393 self.record_metric(MetricDataPoint {
394 name: "auth_attempts_total".to_string(),
395 value: 1.0,
396 timestamp: current_timestamp(),
397 labels,
398 metric_type: MetricType::Counter,
399 })
400 .await;
401 }
402 }
403
404 pub async fn record_token_creation(&self, token_type: &str) {
406 self.performance
407 .token_creations
408 .fetch_add(1, Ordering::Relaxed);
409
410 if self.config.enable_performance_metrics {
411 let mut labels = HashMap::new();
412 labels.insert("token_type".to_string(), token_type.to_string());
413
414 self.record_metric(MetricDataPoint {
415 name: "tokens_created_total".to_string(),
416 value: 1.0,
417 timestamp: current_timestamp(),
418 labels,
419 metric_type: MetricType::Counter,
420 })
421 .await;
422 }
423 }
424
425 pub async fn record_token_validation(&self, valid: bool) {
427 self.performance
428 .token_validations
429 .fetch_add(1, Ordering::Relaxed);
430
431 if self.config.enable_performance_metrics {
432 let mut labels = HashMap::new();
433 labels.insert(
434 "result".to_string(),
435 if valid { "valid" } else { "invalid" }.to_string(),
436 );
437
438 self.record_metric(MetricDataPoint {
439 name: "tokens_validated_total".to_string(),
440 value: 1.0,
441 timestamp: current_timestamp(),
442 labels,
443 metric_type: MetricType::Counter,
444 })
445 .await;
446 }
447 }
448
449 pub async fn update_session_count(&self, count: u64) {
451 self.performance
452 .active_sessions
453 .store(count, Ordering::Relaxed);
454
455 if self.config.enable_performance_metrics {
456 self.record_metric(MetricDataPoint {
457 name: "active_sessions".to_string(),
458 value: count as f64,
459 timestamp: current_timestamp(),
460 labels: HashMap::new(),
461 metric_type: MetricType::Gauge,
462 })
463 .await;
464 }
465 }
466
467 pub async fn record_mfa_challenge(&self, method: &str) {
469 self.performance
470 .mfa_challenges
471 .fetch_add(1, Ordering::Relaxed);
472
473 if self.config.enable_performance_metrics {
474 let mut labels = HashMap::new();
475 labels.insert("method".to_string(), method.to_string());
476
477 self.record_metric(MetricDataPoint {
478 name: "mfa_challenges_total".to_string(),
479 value: 1.0,
480 timestamp: current_timestamp(),
481 labels,
482 metric_type: MetricType::Counter,
483 })
484 .await;
485 }
486 }
487
488 pub async fn record_security_event(&self, event: SecurityEvent) {
490 if !self.config.enable_security_metrics {
491 return;
492 }
493
494 let mut events = self.security_events.write().await;
495 events.push(event.clone());
496
497 if events.len() > self.config.max_history_size {
499 events.remove(0);
500 }
501
502 tracing::warn!(
503 "Security event: {:?} - User: {:?}, Severity: {:?}",
504 event.event_type,
505 event.user_id,
506 event.severity
507 );
508
509 if event.severity == SecurityEventSeverity::Critical {
511 tracing::error!("CRITICAL security event: {:?}", event);
513 }
514 }
515
516 async fn record_metric(&self, metric: MetricDataPoint) {
518 if !self.config.enabled {
519 return;
520 }
521
522 let mut metrics = self.metrics_history.write().await;
523 metrics.push(metric);
524
525 if metrics.len() > self.config.max_history_size {
527 metrics.remove(0);
528 }
529 }
530
531 async fn update_avg_response_time(&self, duration: Duration) {
533 let current_avg = self.performance.avg_response_time.load(Ordering::Relaxed);
534 let new_time = duration.as_micros() as u64;
535
536 let updated_avg = if current_avg == 0 {
538 new_time
539 } else {
540 (current_avg + new_time) / 2
541 };
542
543 self.performance
544 .avg_response_time
545 .store(updated_avg, Ordering::Relaxed);
546 }
547
548 pub fn get_performance_metrics(&self) -> HashMap<String, u64> {
550 let mut metrics = HashMap::new();
551 metrics.insert(
552 "auth_requests".to_string(),
553 self.performance.auth_requests.load(Ordering::Relaxed),
554 );
555 metrics.insert(
556 "auth_successes".to_string(),
557 self.performance.auth_successes.load(Ordering::Relaxed),
558 );
559 metrics.insert(
560 "auth_failures".to_string(),
561 self.performance.auth_failures.load(Ordering::Relaxed),
562 );
563 metrics.insert(
564 "token_creations".to_string(),
565 self.performance.token_creations.load(Ordering::Relaxed),
566 );
567 metrics.insert(
568 "token_validations".to_string(),
569 self.performance.token_validations.load(Ordering::Relaxed),
570 );
571 metrics.insert(
572 "active_sessions".to_string(),
573 self.performance.active_sessions.load(Ordering::Relaxed),
574 );
575 metrics.insert(
576 "mfa_challenges".to_string(),
577 self.performance.mfa_challenges.load(Ordering::Relaxed),
578 );
579 metrics.insert(
580 "avg_response_time_us".to_string(),
581 self.performance.avg_response_time.load(Ordering::Relaxed),
582 );
583 metrics
584 }
585
586 pub async fn get_security_events(&self, limit: Option<usize>) -> Vec<SecurityEvent> {
588 let events = self.security_events.read().await;
589 let limit = limit.unwrap_or(100);
590
591 if events.len() <= limit {
592 events.clone()
593 } else {
594 events.iter().rev().take(limit).cloned().collect()
595 }
596 }
597
598 pub async fn get_metrics_history(&self, metric_name: Option<&str>) -> Vec<MetricDataPoint> {
600 let metrics = self.metrics_history.read().await;
601
602 if let Some(name) = metric_name {
603 metrics.iter().filter(|m| m.name == name).cloned().collect()
604 } else {
605 metrics.clone()
606 }
607 }
608
609 pub async fn health_check(&self) -> Result<HashMap<String, HealthCheckResult>> {
611 if !self.config.enable_health_checks {
612 let mut results = HashMap::new();
613 let timestamp = SystemTime::now()
614 .duration_since(UNIX_EPOCH)
615 .unwrap_or_default()
616 .as_secs();
617 results.insert(
618 "monitoring".to_string(),
619 HealthCheckResult {
620 component: "monitoring".to_string(),
621 status: HealthStatus::Healthy,
622 message: "Health checks disabled; monitoring subsystem not active"
623 .to_string(),
624 timestamp,
625 response_time: 0,
626 },
627 );
628 return Ok(results);
629 }
630
631 let mut results = HashMap::new();
632 let start_time = SystemTime::now();
633
634 let auth_health = self.check_auth_health().await;
636 results.insert("authentication".to_string(), auth_health);
637
638 let storage_health = self.check_storage_health().await;
640 results.insert("storage".to_string(), storage_health);
641
642 let token_health = self.check_token_health().await;
644 results.insert("tokens".to_string(), token_health);
645
646 let mut health_cache = self.health_results.write().await;
648 for (component, result) in &results {
649 health_cache.insert(component.clone(), result.clone());
650 }
651
652 let elapsed = start_time.elapsed().unwrap_or_default();
653 tracing::debug!("Health check completed in {:?}", elapsed);
654
655 Ok(results)
656 }
657
658 async fn check_auth_health(&self) -> HealthCheckResult {
660 let start_time = SystemTime::now();
661
662 let auth_requests = self.performance.auth_requests.load(Ordering::Relaxed);
664 let auth_failures = self.performance.auth_failures.load(Ordering::Relaxed);
665
666 let status = if auth_requests > 0 {
667 let failure_rate = (auth_failures as f64) / (auth_requests as f64);
668 if failure_rate > 0.5 {
669 HealthStatus::Unhealthy
670 } else if failure_rate > 0.2 {
671 HealthStatus::Degraded
672 } else {
673 HealthStatus::Healthy
674 }
675 } else {
676 HealthStatus::Healthy
677 };
678
679 let message = match status {
680 HealthStatus::Healthy => "Authentication system operating normally".to_string(),
681 HealthStatus::Degraded => format!(
682 "High failure rate: {:.1}%",
683 (auth_failures as f64 / auth_requests as f64) * 100.0
684 ),
685 HealthStatus::Unhealthy => format!(
686 "Critical failure rate: {:.1}%",
687 (auth_failures as f64 / auth_requests as f64) * 100.0
688 ),
689 HealthStatus::Critical => "Authentication system down".to_string(),
690 };
691
692 HealthCheckResult {
693 component: "authentication".to_string(),
694 status,
695 message,
696 timestamp: current_timestamp(),
697 response_time: start_time.elapsed().unwrap_or_default().as_millis() as u64,
698 }
699 }
700
701 async fn check_storage_health(&self) -> HealthCheckResult {
703 let start_time = SystemTime::now();
704
705 let status = match self.test_storage_connectivity().await {
707 Ok(response_time_ms) => {
708 if response_time_ms > 5000 {
709 HealthStatus::Degraded
710 } else {
711 HealthStatus::Healthy
712 }
713 }
714 Err(e) => {
715 warn!("Storage health check failed: {}", e);
716 HealthStatus::Critical
717 }
718 };
719
720 let message = match status {
721 HealthStatus::Healthy => "Storage system operational".to_string(),
722 HealthStatus::Degraded => "Storage system slow but operational".to_string(),
723 HealthStatus::Critical => "Storage system connectivity failed".to_string(),
724 HealthStatus::Unhealthy => "Storage system unhealthy".to_string(),
725 };
726
727 HealthCheckResult {
728 component: "storage".to_string(),
729 status,
730 message,
731 timestamp: current_timestamp(),
732 response_time: start_time.elapsed().unwrap_or_default().as_millis() as u64,
733 }
734 }
735
736 async fn test_storage_connectivity(
738 &self,
739 ) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
740 let start_time = SystemTime::now();
741
742 match tokio::time::timeout(
745 std::time::Duration::from_secs(5),
746 self.attempt_storage_ping(),
747 )
748 .await
749 {
750 Ok(result) => {
751 result?;
752 let response_time = start_time.elapsed()?.as_millis() as u64;
753 Ok(response_time)
754 }
755 Err(_) => Err("Storage connectivity test timed out".into()),
756 }
757 }
758
759 async fn attempt_storage_ping(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
761 let _history = self.metrics_history.read().await;
764 let _health = self.health_results.read().await;
765 Ok(())
766 }
767
768 async fn check_token_health(&self) -> HealthCheckResult {
770 let start_time = SystemTime::now();
771
772 let token_validations = self.performance.token_validations.load(Ordering::Relaxed);
773
774 HealthCheckResult {
775 component: "tokens".to_string(),
776 status: HealthStatus::Healthy,
777 message: format!(
778 "Token system operational - {} validations",
779 token_validations
780 ),
781 timestamp: current_timestamp(),
782 response_time: start_time.elapsed().unwrap_or_default().as_millis() as u64,
783 }
784 }
785
786 pub async fn export_prometheus_metrics(&self) -> String {
788 let mut output = String::new();
789
790 let metrics = self.get_performance_metrics();
791
792 for (name, value) in metrics {
793 output.push_str(&format!(
794 "# HELP auth_{} Authentication framework metric\n",
795 name
796 ));
797 output.push_str(&format!("# TYPE auth_{} counter\n", name));
798 output.push_str(&format!("auth_{} {}\n", name, value));
799 }
800
801 output
802 }
803}
804
/// Returns the current wall-clock time as whole seconds since the Unix epoch.
///
/// Yields `0` if the system clock reports a time before the epoch.
fn current_timestamp() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
812
#[cfg(test)]
mod tests {
    use super::*;
    use tokio;

    /// Builds a manager backed by the default configuration.
    fn default_manager() -> MonitoringManager {
        MonitoringManager::new(MonitoringConfig::default())
    }

    /// A freshly created manager reports zero authentication requests.
    #[tokio::test]
    async fn test_monitoring_manager_creation() {
        let manager = default_manager();

        let snapshot = manager.get_performance_metrics();
        assert_eq!(snapshot["auth_requests"], 0);
    }

    /// Each recorded request bumps the `auth_requests` counter by one.
    #[tokio::test]
    async fn test_auth_request_recording() {
        let manager = default_manager();

        for _ in 0..2 {
            manager.record_auth_request().await;
        }

        let snapshot = manager.get_performance_metrics();
        assert_eq!(snapshot["auth_requests"], 2);
    }

    /// A recorded security event can be read back.
    #[tokio::test]
    async fn test_security_event_recording() {
        let manager = default_manager();

        let failed_login = SecurityEvent {
            timestamp: current_timestamp(),
            severity: SecurityEventSeverity::Medium,
            event_type: SecurityEventType::FailedLogin,
            user_id: Some("test_user".to_string()),
            ip_address: Some("127.0.0.1".to_string()),
            details: HashMap::new(),
        };
        manager.record_security_event(failed_login).await;

        let stored = manager.get_security_events(None).await;
        assert_eq!(stored.len(), 1);
        assert_eq!(stored[0].event_type, SecurityEventType::FailedLogin);
    }

    /// The builder carries every configured field into the finished event.
    #[tokio::test]
    async fn test_security_event_builder() {
        let builder = SecurityEvent::builder(
            SecurityEventType::AccountLockout,
            SecurityEventSeverity::High,
        );
        let event = builder
            .user("user-42")
            .ip("10.0.0.1")
            .detail("reason", "too many failures")
            .detail("attempts", "5")
            .build();

        assert_eq!(event.event_type, SecurityEventType::AccountLockout);
        assert_eq!(event.severity, SecurityEventSeverity::High);
        assert_eq!(event.user_id.as_deref(), Some("user-42"));
        assert_eq!(event.ip_address.as_deref(), Some("10.0.0.1"));
        assert_eq!(event.details.len(), 2);
        assert_eq!(event.details["reason"], "too many failures");
        assert!(event.timestamp > 0);
    }

    /// A builder with no optional fields yields an event with empty optionals.
    #[tokio::test]
    async fn test_security_event_builder_minimal() {
        let builder = SecurityEvent::builder(
            SecurityEventType::SystemError,
            SecurityEventSeverity::Low,
        );
        let event = builder.build();

        assert_eq!(event.event_type, SecurityEventType::SystemError);
        assert!(event.user_id.is_none());
        assert!(event.ip_address.is_none());
        assert!(event.details.is_empty());
    }

    /// With default settings the health check reports all three components.
    #[tokio::test]
    async fn test_health_check() {
        let manager = default_manager();

        let report = manager.health_check().await.unwrap();

        assert!(report.contains_key("authentication"));
        assert!(report.contains_key("storage"));
        assert!(report.contains_key("tokens"));
    }

    /// The Prometheus export contains the prefixed metric name plus HELP/TYPE headers.
    #[tokio::test]
    async fn test_prometheus_export() {
        let manager = default_manager();
        manager.record_auth_request().await;

        let exposition = manager.export_prometheus_metrics().await;

        assert!(exposition.contains("auth_auth_requests"));
        assert!(exposition.contains("# HELP"));
        assert!(exposition.contains("# TYPE"));
    }
}