1use std::{
8 collections::HashMap,
9 sync::Arc,
10 time::{Duration, SystemTime},
11};
12
13use serde::{Deserialize, Serialize};
14use tokio::sync::RwLock;
15use tracing::{info, error};
16
17use crate::nat_traversal_api::NatTraversalConfig;
19
20pub mod metrics;
21pub mod alerting;
22pub mod distributed_tracing;
23pub mod diagnostics;
24pub mod health;
25pub mod export;
26pub mod dashboards;
27pub mod structured_logging;
28pub mod error_recovery;
29
30pub use metrics::{ProductionMetricsCollector, MetricsConfig};
32pub use alerting::{ProductionAlertManager, AlertingConfig};
33pub use distributed_tracing::{DistributedTraceCollector, TracingConfig};
34pub use diagnostics::{DiagnosticEngine, DiagnosticsConfig};
35pub use health::{HealthMonitor, HealthConfig};
36pub use export::{ExportManager, ExportConfig as MonitoringExportConfig};
37pub use dashboards::{DashboardManager, DashboardConfig};
38
39pub struct MonitoringSystem {
41 metrics_collector: Arc<ProductionMetricsCollector>,
43 alert_manager: Arc<ProductionAlertManager>,
45 trace_collector: Arc<DistributedTraceCollector>,
47 health_monitor: Arc<HealthMonitor>,
49 diagnostic_engine: Arc<DiagnosticEngine>,
51 export_manager: Arc<ExportManager>,
53 dashboard_manager: Arc<DashboardManager>,
55 config: MonitoringConfig,
57 state: Arc<RwLock<MonitoringState>>,
59}
60
61impl MonitoringSystem {
62 pub async fn new(config: MonitoringConfig) -> Result<Self, MonitoringError> {
64 info!("Initializing production monitoring system");
65
66 let metrics_collector = Arc::new(
68 ProductionMetricsCollector::new(config.metrics.clone()).await?
69 );
70
71 let alert_manager = Arc::new(
72 ProductionAlertManager::new(config.alerting.clone()).await?
73 );
74
75 let trace_collector = Arc::new(
76 DistributedTraceCollector::new(config.tracing.clone()).await?
77 );
78
79 let health_monitor = Arc::new(
80 HealthMonitor::new(config.health.clone()).await?
81 );
82
83 let diagnostic_engine = Arc::new(
84 DiagnosticEngine::new(config.diagnostics.clone()).await?
85 );
86
87 let export_manager = Arc::new(
88 ExportManager::new(config.export.clone()).await?
89 );
90
91 let dashboard_manager = Arc::new(
92 DashboardManager::new(config.dashboards.clone()).await?
93 );
94
95 let state = Arc::new(RwLock::new(MonitoringState::new()));
96
97 Ok(Self {
98 metrics_collector,
99 alert_manager,
100 trace_collector,
101 health_monitor,
102 diagnostic_engine,
103 export_manager,
104 dashboard_manager,
105 config,
106 state,
107 })
108 }
109
110 pub async fn start(&self) -> Result<(), MonitoringError> {
112 info!("Starting production monitoring system");
113
114 {
116 let mut state = self.state.write().await;
117 state.start_time = Some(SystemTime::now());
118 state.status = MonitoringStatus::Starting;
119 }
120
121 self.metrics_collector.start().await?;
123 self.trace_collector.start().await?;
124 self.health_monitor.start().await?;
125 self.diagnostic_engine.start().await?;
126 self.alert_manager.start().await?;
127 self.export_manager.start().await?;
128 self.dashboard_manager.start().await?;
129
130 {
132 let mut state = self.state.write().await;
133 state.status = MonitoringStatus::Running;
134 }
135
136 info!("Production monitoring system started successfully");
137 Ok(())
138 }
139
140 pub async fn stop(&self) -> Result<(), MonitoringError> {
142 info!("Stopping production monitoring system");
143
144 {
146 let mut state = self.state.write().await;
147 state.status = MonitoringStatus::Stopping;
148 }
149
150 self.dashboard_manager.stop().await?;
152 self.export_manager.stop().await?;
153 self.alert_manager.stop().await?;
154 self.diagnostic_engine.stop().await?;
155 self.health_monitor.stop().await?;
156 self.trace_collector.stop().await?;
157 self.metrics_collector.stop().await?;
158
159 {
161 let mut state = self.state.write().await;
162 state.status = MonitoringStatus::Stopped;
163 state.stop_time = Some(SystemTime::now());
164 }
165
166 info!("Production monitoring system stopped");
167 Ok(())
168 }
169
170 pub async fn record_nat_attempt(&self, attempt: NatTraversalAttempt) -> Result<(), MonitoringError> {
172 self.metrics_collector.record_nat_attempt(&attempt).await?;
174
175 if self.config.tracing.enabled {
177 self.trace_collector.start_nat_trace(&attempt).await?;
178 }
179
180 self.alert_manager.evaluate_nat_attempt(&attempt).await?;
182
183 Ok(())
184 }
185
186 pub async fn record_nat_result(&self, result: NatTraversalResult) -> Result<(), MonitoringError> {
188 self.metrics_collector.record_nat_result(&result).await?;
190
191 if self.config.tracing.enabled {
193 self.trace_collector.complete_nat_trace(&result).await?;
194 }
195
196 self.health_monitor.update_nat_health(&result).await?;
198
199 self.alert_manager.evaluate_nat_result(&result).await?;
201
202 if !result.success {
204 self.diagnostic_engine.analyze_failure(&result).await?;
205 }
206
207 Ok(())
208 }
209
210 pub async fn get_status(&self) -> MonitoringSystemStatus {
212 let state = self.state.read().await;
213 let uptime = state.start_time.map(|start| start.elapsed().unwrap_or_default());
214
215 MonitoringSystemStatus {
216 status: state.status.clone(),
217 uptime,
218 subsystems: SubsystemStatus {
219 metrics: self.metrics_collector.get_status().await,
220 alerting: self.alert_manager.get_status().await,
221 tracing: self.trace_collector.get_status().await,
222 health: self.health_monitor.get_status().await,
223 diagnostics: self.diagnostic_engine.get_status().await,
224 export: self.export_manager.get_status().await,
225 dashboards: self.dashboard_manager.get_status().await,
226 },
227 metrics_summary: self.get_metrics_summary().await,
228 }
229 }
230
231 async fn get_metrics_summary(&self) -> MetricsSummary {
233 self.metrics_collector.get_summary().await
234 }
235
236 pub async fn trigger_diagnostic(&self, diagnostic_type: DiagnosticType) -> Result<DiagnosticReport, MonitoringError> {
238 self.diagnostic_engine.run_diagnostic(diagnostic_type).await
239 }
240
241 pub async fn health_check(&self) -> HealthCheckResult {
243 self.health_monitor.comprehensive_health_check().await
244 }
245}
246
247#[derive(Debug, Serialize, Deserialize)]
249pub struct HealthCheckResult {
250 pub status: HealthStatus,
252 pub components: HashMap<String, ComponentHealth>,
254 pub timestamp: SystemTime,
256 pub score: u8,
258}
259
260#[derive(Debug, Clone, Serialize, Deserialize)]
262pub enum HealthStatus {
263 Healthy,
264 Degraded,
265 Unhealthy,
266 Unknown,
267}
268
269#[derive(Debug, Serialize, Deserialize)]
271pub struct ComponentHealth {
272 pub status: HealthStatus,
274 pub message: String,
276 pub response_time_ms: u64,
278 pub error_count: u64,
280}
281
282#[derive(Debug, Clone, Serialize, Deserialize)]
284pub enum DiagnosticType {
285 NatTraversalFailure,
286 ConnectionPerformance,
287 NetworkConnectivity,
288 SystemHealth,
289 SecurityAudit,
290}
291
292#[derive(Debug, Serialize, Deserialize)]
294pub struct DiagnosticReport {
295 pub id: String,
297 pub diagnostic_type: DiagnosticType,
299 pub timestamp: SystemTime,
301 pub severity: DiagnosticSeverity,
303 pub findings: Vec<DiagnosticFinding>,
305 pub recommendations: Vec<DiagnosticRecommendation>,
307 pub metadata: HashMap<String, String>,
309}
310
311#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
313pub enum DiagnosticSeverity {
314 Info,
315 Warning,
316 Error,
317 Critical,
318}
319
320#[derive(Debug, Serialize, Deserialize)]
322pub struct DiagnosticFinding {
323 pub id: String,
325 pub title: String,
327 pub description: String,
329 pub severity: DiagnosticSeverity,
331 pub evidence: Vec<String>,
333 pub confidence: u8,
335}
336
337#[derive(Debug, Serialize, Deserialize)]
339pub struct DiagnosticRecommendation {
340 pub id: String,
342 pub title: String,
344 pub description: String,
346 pub priority: RecommendationPriority,
348 pub steps: Vec<String>,
350 pub impact: String,
352}
353
354#[derive(Debug, Clone, Serialize, Deserialize)]
356pub enum RecommendationPriority {
357 Low,
358 Medium,
359 High,
360 Critical,
361}
362
363#[derive(Debug, Clone, Serialize, Deserialize)]
365pub struct MonitoringConfig {
366 pub metrics: MetricsConfig,
368 pub alerting: AlertingConfig,
370 pub tracing: TracingConfig,
372 pub health: HealthConfig,
374 pub diagnostics: DiagnosticsConfig,
376 pub export: MonitoringExportConfig,
378 pub dashboards: DashboardConfig,
380 pub global: GlobalConfig,
382}
383
384impl Default for MonitoringConfig {
385 fn default() -> Self {
386 Self {
387 metrics: MetricsConfig::default(),
388 alerting: AlertingConfig::default(),
389 tracing: TracingConfig::default(),
390 health: HealthConfig::default(),
391 diagnostics: DiagnosticsConfig::default(),
392 export: MonitoringExportConfig::default(),
393 dashboards: DashboardConfig::default(),
394 global: GlobalConfig::default(),
395 }
396 }
397}
398
399#[derive(Debug, Clone, Serialize, Deserialize)]
401pub struct GlobalConfig {
402 pub service_name: String,
404 pub service_version: String,
406 pub environment: String,
408 pub region: String,
410 pub instance_id: String,
412 pub max_overhead_percent: f32,
414 pub security: SecurityConfig,
416}
417
418impl Default for GlobalConfig {
419 fn default() -> Self {
420 Self {
421 service_name: "ant-quic".to_string(),
422 service_version: "0.2.1".to_string(),
423 environment: "dev".to_string(),
424 region: "local".to_string(),
425 instance_id: "default".to_string(),
426 max_overhead_percent: 1.0,
427 security: SecurityConfig::default(),
428 }
429 }
430}
431
432#[derive(Debug, Clone, Serialize, Deserialize)]
434pub struct SecurityConfig {
435 pub anonymize_data: bool,
437 pub audit_logging: bool,
439 pub access_control: AccessControlConfig,
441 pub encryption: EncryptionConfig,
443}
444
445impl Default for SecurityConfig {
446 fn default() -> Self {
447 Self {
448 anonymize_data: true,
449 audit_logging: true,
450 access_control: AccessControlConfig::default(),
451 encryption: EncryptionConfig::default(),
452 }
453 }
454}
455
456#[derive(Debug, Clone, Serialize, Deserialize)]
458pub struct AccessControlConfig {
459 pub enabled: bool,
461 pub provider: AuthProvider,
463 pub required_permissions: Vec<String>,
465}
466
467impl Default for AccessControlConfig {
468 fn default() -> Self {
469 Self {
470 enabled: true,
471 provider: AuthProvider::OAuth2,
472 required_permissions: vec!["monitoring:read".to_string()],
473 }
474 }
475}
476
477#[derive(Debug, Clone, Serialize, Deserialize)]
479pub enum AuthProvider {
480 OAuth2,
481 OIDC,
482 ApiKey,
483 Certificate,
484}
485
486#[derive(Debug, Clone, Serialize, Deserialize)]
488pub struct EncryptionConfig {
489 pub tls_enabled: bool,
491 pub at_rest_encryption: bool,
493 pub key_rotation_period: Duration,
495}
496
497impl Default for EncryptionConfig {
498 fn default() -> Self {
499 Self {
500 tls_enabled: true,
501 at_rest_encryption: true,
502 key_rotation_period: Duration::from_secs(86400 * 30), }
504 }
505}
506
507#[derive(Debug, Clone, Serialize, Deserialize)]
509pub struct NatTraversalAttempt {
510 pub attempt_id: String,
512 pub timestamp: SystemTime,
514 pub client_info: EndpointInfo,
516 pub server_info: EndpointInfo,
518 pub nat_config: NatTraversalConfig,
520 pub bootstrap_nodes: Vec<String>,
522 pub network_conditions: NetworkConditions,
524}
525
526#[derive(Debug, Clone, Serialize, Deserialize)]
528pub struct NatTraversalResult {
529 pub attempt_id: String,
531 pub success: bool,
533 pub duration: Duration,
535 pub connection_info: Option<ConnectionInfo>,
537 pub error_info: Option<ErrorInfo>,
539 pub performance_metrics: PerformanceMetrics,
541 pub candidates_used: Vec<CandidateInfo>,
543}
544
545#[derive(Debug, Clone, Serialize, Deserialize)]
547pub struct EndpointInfo {
548 pub id: String,
550 pub role: EndpointRole,
552 pub address_hash: String,
554 pub nat_type: Option<NatType>,
556 pub region: Option<String>,
558}
559
560#[derive(Debug, Clone, Serialize, Deserialize)]
562pub struct ConnectionInfo {
563 pub path: ConnectionPath,
565 pub quality: ConnectionQuality,
567 pub protocol_info: ProtocolInfo,
569}
570
571#[derive(Debug, Clone, Serialize, Deserialize)]
573pub struct ErrorInfo {
574 pub error_code: String,
576 pub error_category: ErrorCategory,
578 pub error_message: String,
580 pub error_context: HashMap<String, String>,
582 pub recovery_suggestions: Vec<String>,
584}
585
586#[derive(Debug, Clone, Serialize, Deserialize)]
588pub struct PerformanceMetrics {
589 pub connection_time_ms: u64,
591 pub first_candidate_time_ms: u64,
593 pub success_time_ms: Option<u64>,
595 pub candidates_tried: u32,
597 pub round_trips: u32,
599 pub setup_bytes: u64,
601}
602
603#[derive(Debug, Clone, Serialize, Deserialize)]
605pub struct NetworkConditions {
606 pub rtt_ms: Option<u32>,
608 pub packet_loss_rate: Option<f32>,
610 pub bandwidth_mbps: Option<u32>,
612 pub congestion_level: CongestionLevel,
614}
615
616#[derive(Debug, Clone, Serialize, Deserialize)]
618pub struct ConnectionPath {
619 pub path_type: PathType,
621 pub hops: Vec<HopInfo>,
623 pub quality_score: f32,
625}
626
627#[derive(Debug, Clone, Serialize, Deserialize)]
629pub struct ConnectionQuality {
630 pub latency_ms: u32,
632 pub jitter_ms: u32,
634 pub throughput_mbps: f32,
636 pub stability_score: f32,
638}
639
640#[derive(Debug, Clone, Serialize, Deserialize)]
642pub struct ProtocolInfo {
643 pub quic_version: String,
645 pub cipher: String,
647 pub extensions: Vec<String>,
649}
650
651#[derive(Debug, Clone, Serialize, Deserialize)]
653pub struct CandidateInfo {
654 pub candidate_type: CandidateType,
656 pub priority: u32,
658 pub success: bool,
660 pub test_time_ms: u64,
662}
663
664#[derive(Debug, Clone, Serialize, Deserialize)]
666pub struct HopInfo {
667 pub hop_id: String,
669 pub hop_type: HopType,
671 pub latency_ms: u32,
673}
674
675#[derive(Debug, Clone, Serialize, Deserialize)]
677pub enum EndpointRole {
678 Client,
679 Server,
680 Bootstrap,
681 Relay,
682}
683
684#[derive(Debug, Clone, Serialize, Deserialize)]
686pub enum NatType {
687 FullCone,
688 RestrictedCone,
689 PortRestrictedCone,
690 Symmetric,
691 CarrierGrade,
692 DoubleNat,
693 None,
694}
695
696#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
698pub enum ErrorCategory {
699 NetworkConnectivity,
700 NatTraversal,
701 Authentication,
702 Protocol,
703 Timeout,
704 ResourceExhaustion,
705 Configuration,
706 Unknown,
707}
708
709#[derive(Debug, Clone, Serialize, Deserialize)]
711pub enum CongestionLevel {
712 Low,
713 Medium,
714 High,
715 Critical,
716}
717
718#[derive(Debug, Clone, Serialize, Deserialize)]
720pub enum PathType {
721 Direct,
722 NatTraversed,
723 Relayed,
724 TurnRelayed,
725}
726
727#[derive(Debug, Clone, Serialize, Deserialize)]
729pub enum CandidateType {
730 Host,
731 ServerReflexive,
732 PeerReflexive,
733 Relay,
734}
735
736#[derive(Debug, Clone, Serialize, Deserialize)]
738pub enum HopType {
739 Router,
740 Nat,
741 Firewall,
742 Proxy,
743 Relay,
744}
745
746#[derive(Debug)]
748struct MonitoringState {
749 status: MonitoringStatus,
751 start_time: Option<SystemTime>,
753 stop_time: Option<SystemTime>,
755 error_count: u64,
757 last_error: Option<String>,
759}
760
761impl MonitoringState {
762 fn new() -> Self {
763 Self {
764 status: MonitoringStatus::Stopped,
765 start_time: None,
766 stop_time: None,
767 error_count: 0,
768 last_error: None,
769 }
770 }
771}
772
773#[derive(Debug, Clone, Serialize, Deserialize)]
775pub enum MonitoringStatus {
776 Stopped,
777 Starting,
778 Running,
779 Stopping,
780 Error,
781}
782
783#[derive(Debug, Serialize, Deserialize)]
785pub struct MonitoringSystemStatus {
786 pub status: MonitoringStatus,
788 pub uptime: Option<Duration>,
790 pub subsystems: SubsystemStatus,
792 pub metrics_summary: MetricsSummary,
794}
795
796#[derive(Debug, Serialize, Deserialize)]
798pub struct SubsystemStatus {
799 pub metrics: String,
801 pub alerting: String,
803 pub tracing: String,
805 pub health: String,
807 pub diagnostics: String,
809 pub export: String,
811 pub dashboards: String,
813}
814
815#[derive(Debug, Serialize, Deserialize)]
817pub struct MetricsSummary {
818 pub nat_attempts_last_hour: u64,
820 pub success_rate_last_hour: f32,
822 pub avg_connection_time_ms: u64,
824 pub active_connections: u64,
826 pub error_rate_last_hour: f32,
828}
829
830#[derive(Debug, Clone, thiserror::Error)]
832pub enum MonitoringError {
833 #[error("Metrics collection error: {0}")]
834 MetricsError(String),
835
836 #[error("Alerting error: {0}")]
837 AlertingError(String),
838
839 #[error("Tracing error: {0}")]
840 TracingError(String),
841
842 #[error("Health monitoring error: {0}")]
843 HealthError(String),
844
845 #[error("Diagnostics error: {0}")]
846 DiagnosticsError(String),
847
848 #[error("Export error: {0}")]
849 ExportError(String),
850
851 #[error("Configuration error: {0}")]
852 ConfigError(String),
853
854 #[error("System error: {0}")]
855 SystemError(String),
856}
857
#[cfg(test)]
mod tests {
    use super::*;

    /// A system built from the default configuration reports `Stopped` before `start()`.
    #[tokio::test]
    async fn test_monitoring_system_creation() {
        let system = MonitoringSystem::new(MonitoringConfig::default())
            .await
            .unwrap();

        let snapshot = system.get_status().await;
        assert!(matches!(snapshot.status, MonitoringStatus::Stopped));
    }

    /// The default configuration can be serialized to JSON and parsed back without error.
    #[test]
    fn test_config_serialization() {
        let encoded = serde_json::to_string_pretty(&MonitoringConfig::default()).unwrap();
        let _decoded: MonitoringConfig = serde_json::from_str(&encoded).unwrap();
    }
}