1use crate::errors::Result;
17use serde::{Deserialize, Serialize};
18use std::collections::HashMap;
19use std::sync::Arc;
20use std::sync::atomic::{AtomicU64, Ordering};
21use std::time::{Duration, SystemTime, UNIX_EPOCH};
22use tokio::sync::RwLock;
23use tracing::warn;
24
25pub mod alerts;
26pub mod collectors;
27pub mod exporters;
28pub mod health;
29
/// Settings that control what the monitoring subsystem records.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonitoringConfig {
    /// Master switch: when `false`, metric data points are not appended to the history.
    pub enabled: bool,
    /// Collection interval.
    /// NOTE(review): the unit is not visible in this file (default is 60, presumably seconds) — confirm.
    pub collection_interval: u64,
    /// Maximum number of entries kept in the metric history and, separately, in the
    /// security-event log; the oldest entry is dropped once the limit is exceeded.
    pub max_history_size: usize,
    /// Whether the `record_*` methods append performance data points to the history.
    pub enable_performance_metrics: bool,
    /// Whether security events are stored and logged.
    pub enable_security_metrics: bool,
    /// Whether `health_check` runs its component checks (it returns an empty map otherwise).
    pub enable_health_checks: bool,
    /// External endpoints.
    /// NOTE(review): not read anywhere in this file — presumably consumed by the exporters; confirm.
    pub external_endpoints: Vec<String>,
}
48
49impl Default for MonitoringConfig {
50 fn default() -> Self {
51 Self {
52 enabled: true,
53 collection_interval: 60, max_history_size: 1000,
55 enable_performance_metrics: true,
56 enable_security_metrics: true,
57 enable_health_checks: true,
58 external_endpoints: vec![],
59 }
60 }
61}
62
/// A single recorded metric sample.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricDataPoint {
    /// Metric name, e.g. `"auth_requests_total"`.
    pub name: String,
    /// Sample value.
    pub value: f64,
    /// When the sample was taken; points created in this file use whole seconds
    /// since the Unix epoch.
    pub timestamp: u64,
    /// Free-form key/value labels attached to the sample (e.g. `"result"` -> `"success"`).
    pub labels: HashMap<String, String>,
    /// Kind of metric this sample belongs to.
    pub metric_type: MetricType,
}
77
/// Kind of metric a [`MetricDataPoint`] represents.
///
/// NOTE(review): the variant names mirror the Prometheus metric types; presumably the
/// same semantics are intended — confirm. Only `Counter` and `Gauge` are produced in
/// this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum MetricType {
    /// Used in this file for event counts and running totals.
    Counter,
    /// Used in this file for the current number of active sessions.
    Gauge,
    /// Not produced in this file.
    Histogram,
    /// Not produced in this file.
    Summary,
}
90
/// A security-relevant occurrence kept in the manager's security-event log.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SecurityEvent {
    /// What kind of event this is.
    pub event_type: SecurityEventType,
    /// User the event relates to, when known.
    pub user_id: Option<String>,
    /// Source IP address, when known (events built by `record_auth_failure` leave it `None`).
    pub ip_address: Option<String>,
    /// Additional free-form key/value context (e.g. `"reason"`).
    pub details: HashMap<String, String>,
    /// How serious the event is.
    pub severity: SecurityEventSeverity,
    /// When the event occurred; events created in this file use whole seconds since
    /// the Unix epoch.
    pub timestamp: u64,
}
107
/// Category of a [`SecurityEvent`].
///
/// Only `FailedLogin` is produced by the code in this file; the remaining variants
/// are presumably emitted by other modules — verify against callers.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum SecurityEventType {
    /// A failed authentication attempt (emitted by `record_auth_failure`).
    FailedLogin,
    /// An account lockout.
    AccountLockout,
    /// A privilege escalation.
    PrivilegeEscalation,
    /// Unusual activity.
    UnusualActivity,
    /// Token manipulation.
    TokenManipulation,
    /// A configuration change.
    ConfigurationChange,
    /// A system error.
    SystemError,
}
126
/// Severity of a [`SecurityEvent`].
///
/// The derived `PartialOrd` follows the explicit discriminants, so
/// `Low < Medium < High < Critical`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)]
pub enum SecurityEventSeverity {
    /// Lowest severity.
    Low = 1,
    /// Severity assigned to failed logins by `record_auth_failure`.
    Medium = 2,
    /// High severity.
    High = 3,
    /// Highest severity; `record_security_event` additionally logs these at error level.
    Critical = 4,
}
139
/// Shared atomic counters describing authentication activity.
///
/// Every field is an `Arc<AtomicU64>`, so cloning a `PerformanceMetrics` yields a
/// handle onto the *same* counters rather than a snapshot. The derived `Default`
/// gives each field its own fresh counter starting at zero.
#[derive(Debug, Clone, Default)]
pub struct PerformanceMetrics {
    /// Number of authentication requests recorded.
    pub auth_requests: Arc<AtomicU64>,
    /// Number of successful authentications recorded.
    pub auth_successes: Arc<AtomicU64>,
    /// Number of failed authentications recorded.
    pub auth_failures: Arc<AtomicU64>,
    /// Number of tokens created.
    pub token_creations: Arc<AtomicU64>,
    /// Number of token validations performed (valid or not).
    pub token_validations: Arc<AtomicU64>,
    /// Most recently reported number of active sessions.
    pub active_sessions: Arc<AtomicU64>,
    /// Number of MFA challenges issued.
    pub mfa_challenges: Arc<AtomicU64>,
    /// Smoothed authentication response time, in microseconds.
    pub avg_response_time: Arc<AtomicU64>,
}
175
/// Health classification reported for a component.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum HealthStatus {
    /// The component is operating normally.
    Healthy,
    /// The component works but shows problems (e.g. auth failure rate above 20%,
    /// or a storage ping slower than 5000 ms).
    Degraded,
    /// The component is failing significantly (e.g. auth failure rate above 50%).
    Unhealthy,
    /// The component is unavailable (e.g. the storage connectivity test failed).
    Critical,
}
188
/// Outcome of a health check for one component.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckResult {
    /// Name of the checked component (e.g. `"authentication"`, `"storage"`, `"tokens"`).
    pub component: String,
    /// Resulting health classification.
    pub status: HealthStatus,
    /// Human-readable explanation of the status.
    pub message: String,
    /// When the check finished; results created in this file use whole seconds
    /// since the Unix epoch.
    pub timestamp: u64,
    /// How long the check took; results created in this file report milliseconds.
    pub response_time: u64,
}
203
/// Central collector for performance counters, metric history, security events
/// and health-check results.
pub struct MonitoringManager {
    /// Settings that decide which kinds of data are recorded.
    config: MonitoringConfig,
    /// Atomic counters updated by the `record_*` methods.
    performance: PerformanceMetrics,
    /// Recorded metric samples, oldest first, capped at `config.max_history_size`.
    metrics_history: Arc<RwLock<Vec<MetricDataPoint>>>,
    /// Recorded security events, oldest first, capped at `config.max_history_size`.
    security_events: Arc<RwLock<Vec<SecurityEvent>>>,
    /// Latest health-check result per component name, refreshed by `health_check`.
    health_results: Arc<RwLock<HashMap<String, HealthCheckResult>>>,
}
217
218impl MonitoringManager {
219 pub fn new(config: MonitoringConfig) -> Self {
221 Self {
222 config,
223 performance: PerformanceMetrics::default(),
224 metrics_history: Arc::new(RwLock::new(Vec::new())),
225 security_events: Arc::new(RwLock::new(Vec::new())),
226 health_results: Arc::new(RwLock::new(HashMap::new())),
227 }
228 }
229
230 pub async fn record_auth_request(&self) {
232 self.performance
233 .auth_requests
234 .fetch_add(1, Ordering::Relaxed);
235
236 if self.config.enable_performance_metrics {
237 self.record_metric(MetricDataPoint {
238 name: "auth_requests_total".to_string(),
239 value: self.performance.auth_requests.load(Ordering::Relaxed) as f64,
240 timestamp: current_timestamp(),
241 labels: HashMap::new(),
242 metric_type: MetricType::Counter,
243 })
244 .await;
245 }
246 }
247
248 pub async fn record_auth_success(&self, user_id: &str, duration: Duration) {
250 self.performance
251 .auth_successes
252 .fetch_add(1, Ordering::Relaxed);
253 self.update_avg_response_time(duration).await;
254
255 if self.config.enable_performance_metrics {
256 let mut labels = HashMap::new();
257 labels.insert("result".to_string(), "success".to_string());
258 labels.insert("user_id".to_string(), user_id.to_string());
259
260 self.record_metric(MetricDataPoint {
261 name: "auth_attempts_total".to_string(),
262 value: 1.0,
263 timestamp: current_timestamp(),
264 labels,
265 metric_type: MetricType::Counter,
266 })
267 .await;
268 }
269 }
270
271 pub async fn record_auth_failure(&self, user_id: Option<&str>, reason: &str) {
273 self.performance
274 .auth_failures
275 .fetch_add(1, Ordering::Relaxed);
276
277 if self.config.enable_security_metrics {
278 let mut details = HashMap::new();
279 details.insert("reason".to_string(), reason.to_string());
280 if let Some(user) = user_id {
281 details.insert("user_id".to_string(), user.to_string());
282 }
283
284 let security_event = SecurityEvent {
285 event_type: SecurityEventType::FailedLogin,
286 user_id: user_id.map(|s| s.to_string()),
287 ip_address: None, details,
289 severity: SecurityEventSeverity::Medium,
290 timestamp: current_timestamp(),
291 };
292
293 self.record_security_event(security_event).await;
294 }
295
296 if self.config.enable_performance_metrics {
297 let mut labels = HashMap::new();
298 labels.insert("result".to_string(), "failure".to_string());
299 labels.insert("reason".to_string(), reason.to_string());
300
301 self.record_metric(MetricDataPoint {
302 name: "auth_attempts_total".to_string(),
303 value: 1.0,
304 timestamp: current_timestamp(),
305 labels,
306 metric_type: MetricType::Counter,
307 })
308 .await;
309 }
310 }
311
312 pub async fn record_token_creation(&self, token_type: &str) {
314 self.performance
315 .token_creations
316 .fetch_add(1, Ordering::Relaxed);
317
318 if self.config.enable_performance_metrics {
319 let mut labels = HashMap::new();
320 labels.insert("token_type".to_string(), token_type.to_string());
321
322 self.record_metric(MetricDataPoint {
323 name: "tokens_created_total".to_string(),
324 value: 1.0,
325 timestamp: current_timestamp(),
326 labels,
327 metric_type: MetricType::Counter,
328 })
329 .await;
330 }
331 }
332
333 pub async fn record_token_validation(&self, valid: bool) {
335 self.performance
336 .token_validations
337 .fetch_add(1, Ordering::Relaxed);
338
339 if self.config.enable_performance_metrics {
340 let mut labels = HashMap::new();
341 labels.insert(
342 "result".to_string(),
343 if valid { "valid" } else { "invalid" }.to_string(),
344 );
345
346 self.record_metric(MetricDataPoint {
347 name: "tokens_validated_total".to_string(),
348 value: 1.0,
349 timestamp: current_timestamp(),
350 labels,
351 metric_type: MetricType::Counter,
352 })
353 .await;
354 }
355 }
356
357 pub async fn update_session_count(&self, count: u64) {
359 self.performance
360 .active_sessions
361 .store(count, Ordering::Relaxed);
362
363 if self.config.enable_performance_metrics {
364 self.record_metric(MetricDataPoint {
365 name: "active_sessions".to_string(),
366 value: count as f64,
367 timestamp: current_timestamp(),
368 labels: HashMap::new(),
369 metric_type: MetricType::Gauge,
370 })
371 .await;
372 }
373 }
374
375 pub async fn record_mfa_challenge(&self, method: &str) {
377 self.performance
378 .mfa_challenges
379 .fetch_add(1, Ordering::Relaxed);
380
381 if self.config.enable_performance_metrics {
382 let mut labels = HashMap::new();
383 labels.insert("method".to_string(), method.to_string());
384
385 self.record_metric(MetricDataPoint {
386 name: "mfa_challenges_total".to_string(),
387 value: 1.0,
388 timestamp: current_timestamp(),
389 labels,
390 metric_type: MetricType::Counter,
391 })
392 .await;
393 }
394 }
395
396 pub async fn record_security_event(&self, event: SecurityEvent) {
398 if !self.config.enable_security_metrics {
399 return;
400 }
401
402 let mut events = self.security_events.write().await;
403 events.push(event.clone());
404
405 if events.len() > self.config.max_history_size {
407 events.remove(0);
408 }
409
410 tracing::warn!(
411 "Security event: {:?} - User: {:?}, Severity: {:?}",
412 event.event_type,
413 event.user_id,
414 event.severity
415 );
416
417 if event.severity == SecurityEventSeverity::Critical {
419 tracing::error!("CRITICAL security event: {:?}", event);
421 }
422 }
423
424 async fn record_metric(&self, metric: MetricDataPoint) {
426 if !self.config.enabled {
427 return;
428 }
429
430 let mut metrics = self.metrics_history.write().await;
431 metrics.push(metric);
432
433 if metrics.len() > self.config.max_history_size {
435 metrics.remove(0);
436 }
437 }
438
439 async fn update_avg_response_time(&self, duration: Duration) {
441 let current_avg = self.performance.avg_response_time.load(Ordering::Relaxed);
442 let new_time = duration.as_micros() as u64;
443
444 let updated_avg = if current_avg == 0 {
446 new_time
447 } else {
448 (current_avg + new_time) / 2
449 };
450
451 self.performance
452 .avg_response_time
453 .store(updated_avg, Ordering::Relaxed);
454 }
455
456 pub fn get_performance_metrics(&self) -> HashMap<String, u64> {
458 let mut metrics = HashMap::new();
459 metrics.insert(
460 "auth_requests".to_string(),
461 self.performance.auth_requests.load(Ordering::Relaxed),
462 );
463 metrics.insert(
464 "auth_successes".to_string(),
465 self.performance.auth_successes.load(Ordering::Relaxed),
466 );
467 metrics.insert(
468 "auth_failures".to_string(),
469 self.performance.auth_failures.load(Ordering::Relaxed),
470 );
471 metrics.insert(
472 "token_creations".to_string(),
473 self.performance.token_creations.load(Ordering::Relaxed),
474 );
475 metrics.insert(
476 "token_validations".to_string(),
477 self.performance.token_validations.load(Ordering::Relaxed),
478 );
479 metrics.insert(
480 "active_sessions".to_string(),
481 self.performance.active_sessions.load(Ordering::Relaxed),
482 );
483 metrics.insert(
484 "mfa_challenges".to_string(),
485 self.performance.mfa_challenges.load(Ordering::Relaxed),
486 );
487 metrics.insert(
488 "avg_response_time_us".to_string(),
489 self.performance.avg_response_time.load(Ordering::Relaxed),
490 );
491 metrics
492 }
493
494 pub async fn get_security_events(&self, limit: Option<usize>) -> Vec<SecurityEvent> {
496 let events = self.security_events.read().await;
497 let limit = limit.unwrap_or(100);
498
499 if events.len() <= limit {
500 events.clone()
501 } else {
502 events.iter().rev().take(limit).cloned().collect()
503 }
504 }
505
506 pub async fn get_metrics_history(&self, metric_name: Option<&str>) -> Vec<MetricDataPoint> {
508 let metrics = self.metrics_history.read().await;
509
510 if let Some(name) = metric_name {
511 metrics.iter().filter(|m| m.name == name).cloned().collect()
512 } else {
513 metrics.clone()
514 }
515 }
516
517 pub async fn health_check(&self) -> Result<HashMap<String, HealthCheckResult>> {
519 if !self.config.enable_health_checks {
520 return Ok(HashMap::new());
521 }
522
523 let mut results = HashMap::new();
524 let start_time = SystemTime::now();
525
526 let auth_health = self.check_auth_health().await;
528 results.insert("authentication".to_string(), auth_health);
529
530 let storage_health = self.check_storage_health().await;
532 results.insert("storage".to_string(), storage_health);
533
534 let token_health = self.check_token_health().await;
536 results.insert("tokens".to_string(), token_health);
537
538 let mut health_cache = self.health_results.write().await;
540 for (component, result) in &results {
541 health_cache.insert(component.clone(), result.clone());
542 }
543
544 let elapsed = start_time.elapsed().unwrap_or_default();
545 tracing::debug!("Health check completed in {:?}", elapsed);
546
547 Ok(results)
548 }
549
550 async fn check_auth_health(&self) -> HealthCheckResult {
552 let start_time = SystemTime::now();
553
554 let auth_requests = self.performance.auth_requests.load(Ordering::Relaxed);
556 let auth_failures = self.performance.auth_failures.load(Ordering::Relaxed);
557
558 let status = if auth_requests > 0 {
559 let failure_rate = (auth_failures as f64) / (auth_requests as f64);
560 if failure_rate > 0.5 {
561 HealthStatus::Unhealthy
562 } else if failure_rate > 0.2 {
563 HealthStatus::Degraded
564 } else {
565 HealthStatus::Healthy
566 }
567 } else {
568 HealthStatus::Healthy
569 };
570
571 let message = match status {
572 HealthStatus::Healthy => "Authentication system operating normally".to_string(),
573 HealthStatus::Degraded => format!(
574 "High failure rate: {:.1}%",
575 (auth_failures as f64 / auth_requests as f64) * 100.0
576 ),
577 HealthStatus::Unhealthy => format!(
578 "Critical failure rate: {:.1}%",
579 (auth_failures as f64 / auth_requests as f64) * 100.0
580 ),
581 HealthStatus::Critical => "Authentication system down".to_string(),
582 };
583
584 HealthCheckResult {
585 component: "authentication".to_string(),
586 status,
587 message,
588 timestamp: current_timestamp(),
589 response_time: start_time.elapsed().unwrap_or_default().as_millis() as u64,
590 }
591 }
592
593 async fn check_storage_health(&self) -> HealthCheckResult {
595 let start_time = SystemTime::now();
596
597 let status = match self.test_storage_connectivity().await {
599 Ok(response_time_ms) => {
600 if response_time_ms > 5000 {
601 HealthStatus::Degraded
602 } else {
603 HealthStatus::Healthy
604 }
605 }
606 Err(e) => {
607 warn!("Storage health check failed: {}", e);
608 HealthStatus::Critical
609 }
610 };
611
612 let message = match status {
613 HealthStatus::Healthy => "Storage system operational".to_string(),
614 HealthStatus::Degraded => "Storage system slow but operational".to_string(),
615 HealthStatus::Critical => "Storage system connectivity failed".to_string(),
616 HealthStatus::Unhealthy => "Storage system unhealthy".to_string(),
617 };
618
619 HealthCheckResult {
620 component: "storage".to_string(),
621 status,
622 message,
623 timestamp: current_timestamp(),
624 response_time: start_time.elapsed().unwrap_or_default().as_millis() as u64,
625 }
626 }
627
628 async fn test_storage_connectivity(
630 &self,
631 ) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
632 let start_time = SystemTime::now();
633
634 match tokio::time::timeout(
637 std::time::Duration::from_secs(5),
638 self.attempt_storage_ping(),
639 )
640 .await
641 {
642 Ok(result) => {
643 result?;
644 let response_time = start_time.elapsed()?.as_millis() as u64;
645 Ok(response_time)
646 }
647 Err(_) => Err("Storage connectivity test timed out".into()),
648 }
649 }
650
651 async fn attempt_storage_ping(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
653 tokio::time::sleep(std::time::Duration::from_millis(10)).await;
665
666 Ok(())
673 }
674
675 async fn check_token_health(&self) -> HealthCheckResult {
677 let start_time = SystemTime::now();
678
679 let token_validations = self.performance.token_validations.load(Ordering::Relaxed);
680
681 HealthCheckResult {
682 component: "tokens".to_string(),
683 status: HealthStatus::Healthy,
684 message: format!(
685 "Token system operational - {} validations",
686 token_validations
687 ),
688 timestamp: current_timestamp(),
689 response_time: start_time.elapsed().unwrap_or_default().as_millis() as u64,
690 }
691 }
692
693 pub async fn export_prometheus_metrics(&self) -> String {
695 let mut output = String::new();
696
697 let metrics = self.get_performance_metrics();
698
699 for (name, value) in metrics {
700 output.push_str(&format!(
701 "# HELP auth_{} Authentication framework metric\n",
702 name
703 ));
704 output.push_str(&format!("# TYPE auth_{} counter\n", name));
705 output.push_str(&format!("auth_{} {}\n", name, value));
706 }
707
708 output
709 }
710}
711
/// Returns the current wall-clock time as whole seconds since the Unix epoch,
/// or `0` if the system clock reports a time before the epoch.
fn current_timestamp() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
719
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created manager starts with all counters at zero.
    #[tokio::test]
    async fn test_monitoring_manager_creation() {
        let config = MonitoringConfig::default();
        let manager = MonitoringManager::new(config);

        let metrics = manager.get_performance_metrics();
        assert_eq!(metrics["auth_requests"], 0);
    }

    /// Each recorded request increments the `auth_requests` counter by one.
    #[tokio::test]
    async fn test_auth_request_recording() {
        let config = MonitoringConfig::default();
        let manager = MonitoringManager::new(config);

        manager.record_auth_request().await;
        manager.record_auth_request().await;

        let metrics = manager.get_performance_metrics();
        assert_eq!(metrics["auth_requests"], 2);
    }

    /// A recorded security event can be read back from the event log.
    #[tokio::test]
    async fn test_security_event_recording() {
        let config = MonitoringConfig::default();
        let manager = MonitoringManager::new(config);

        let event = SecurityEvent {
            event_type: SecurityEventType::FailedLogin,
            user_id: Some("test_user".to_string()),
            ip_address: Some("127.0.0.1".to_string()),
            details: HashMap::new(),
            severity: SecurityEventSeverity::Medium,
            timestamp: current_timestamp(),
        };

        manager.record_security_event(event).await;

        let events = manager.get_security_events(None).await;
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].event_type, SecurityEventType::FailedLogin);
    }

    /// The health check reports all three built-in components.
    #[tokio::test]
    async fn test_health_check() {
        let config = MonitoringConfig::default();
        let manager = MonitoringManager::new(config);

        let health_results = manager.health_check().await.unwrap();

        assert!(health_results.contains_key("authentication"));
        assert!(health_results.contains_key("storage"));
        assert!(health_results.contains_key("tokens"));
    }

    /// The Prometheus export contains the prefixed metric name and metadata lines.
    #[tokio::test]
    async fn test_prometheus_export() {
        let config = MonitoringConfig::default();
        let manager = MonitoringManager::new(config);

        manager.record_auth_request().await;

        let prometheus_output = manager.export_prometheus_metrics().await;

        assert!(prometheus_output.contains("auth_auth_requests"));
        assert!(prometheus_output.contains("# HELP"));
        assert!(prometheus_output.contains("# TYPE"));
    }
}