use anyhow::Result;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;
use tokio::sync::RwLock;
use tracing::{error, info, warn};
14
15#[derive(Debug, Clone, Serialize, Deserialize)]
17pub struct EnterpriseMonitoringConfig {
18 pub enabled: bool,
20 pub sla: SlaConfig,
22 pub alerting: AlertingConfig,
24 pub metrics: MetricsConfig,
26 pub health_checks: HealthCheckConfig,
28 pub profiling: ProfilingConfig,
30}
31
32impl Default for EnterpriseMonitoringConfig {
33 fn default() -> Self {
34 Self {
35 enabled: true,
36 sla: SlaConfig::default(),
37 alerting: AlertingConfig::default(),
38 metrics: MetricsConfig::default(),
39 health_checks: HealthCheckConfig::default(),
40 profiling: ProfilingConfig::default(),
41 }
42 }
43}
44
45#[derive(Debug, Clone, Serialize, Deserialize)]
47pub struct SlaConfig {
48 pub enabled: bool,
50 pub objectives: Vec<SlaObjective>,
52 pub reporting_interval_secs: u64,
54 pub breach_notification: BreachNotificationConfig,
56}
57
58impl Default for SlaConfig {
59 fn default() -> Self {
60 Self {
61 enabled: true,
62 objectives: vec![
63 SlaObjective {
64 name: "Availability".to_string(),
65 metric_type: SlaMetricType::Availability,
66 target_value: 99.99,
67 measurement_window: MeasurementWindow::Rolling30Days,
68 severity: SlaSeverity::Critical,
69 },
70 SlaObjective {
71 name: "Latency P99".to_string(),
72 metric_type: SlaMetricType::LatencyP99,
73 target_value: 10.0, measurement_window: MeasurementWindow::Rolling24Hours,
75 severity: SlaSeverity::High,
76 },
77 SlaObjective {
78 name: "Error Rate".to_string(),
79 metric_type: SlaMetricType::ErrorRate,
80 target_value: 0.01, measurement_window: MeasurementWindow::Rolling1Hour,
82 severity: SlaSeverity::High,
83 },
84 ],
85 reporting_interval_secs: 300, breach_notification: BreachNotificationConfig::default(),
87 }
88 }
89}
90
91#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct SlaObjective {
94 pub name: String,
96 pub metric_type: SlaMetricType,
98 pub target_value: f64,
100 pub measurement_window: MeasurementWindow,
102 pub severity: SlaSeverity,
104}
105
106#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
108pub enum SlaMetricType {
109 Availability,
111 LatencyP50,
113 LatencyP95,
115 LatencyP99,
117 Throughput,
119 ErrorRate,
121 ResponseTime,
123}
124
125impl fmt::Display for SlaMetricType {
126 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
127 match self {
128 SlaMetricType::Availability => write!(f, "Availability"),
129 SlaMetricType::LatencyP50 => write!(f, "Latency P50"),
130 SlaMetricType::LatencyP95 => write!(f, "Latency P95"),
131 SlaMetricType::LatencyP99 => write!(f, "Latency P99"),
132 SlaMetricType::Throughput => write!(f, "Throughput"),
133 SlaMetricType::ErrorRate => write!(f, "Error Rate"),
134 SlaMetricType::ResponseTime => write!(f, "Response Time"),
135 }
136 }
137}
138
139#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
141pub enum MeasurementWindow {
142 RealTime,
144 Rolling1Hour,
146 Rolling24Hours,
148 Rolling7Days,
150 Rolling30Days,
152 Custom(u64),
154}
155
156impl fmt::Display for MeasurementWindow {
157 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
158 match self {
159 MeasurementWindow::RealTime => write!(f, "Real-time"),
160 MeasurementWindow::Rolling1Hour => write!(f, "1 hour"),
161 MeasurementWindow::Rolling24Hours => write!(f, "24 hours"),
162 MeasurementWindow::Rolling7Days => write!(f, "7 days"),
163 MeasurementWindow::Rolling30Days => write!(f, "30 days"),
164 MeasurementWindow::Custom(secs) => write!(f, "{} seconds", secs),
165 }
166 }
167}
168
169#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
171pub enum SlaSeverity {
172 Low,
173 Medium,
174 High,
175 Critical,
176}
177
178impl fmt::Display for SlaSeverity {
179 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
180 match self {
181 SlaSeverity::Low => write!(f, "LOW"),
182 SlaSeverity::Medium => write!(f, "MEDIUM"),
183 SlaSeverity::High => write!(f, "HIGH"),
184 SlaSeverity::Critical => write!(f, "CRITICAL"),
185 }
186 }
187}
188
189#[derive(Debug, Clone, Serialize, Deserialize)]
191pub struct BreachNotificationConfig {
192 pub enabled: bool,
194 pub channels: Vec<NotificationChannel>,
196 pub escalation: EscalationPolicy,
198}
199
200impl Default for BreachNotificationConfig {
201 fn default() -> Self {
202 Self {
203 enabled: true,
204 channels: vec![NotificationChannel::Email {
205 recipients: vec!["ops@example.com".to_string()],
206 }],
207 escalation: EscalationPolicy::default(),
208 }
209 }
210}
211
212#[derive(Debug, Clone, Serialize, Deserialize)]
214pub enum NotificationChannel {
215 Email { recipients: Vec<String> },
216 Slack { webhook_url: String },
217 PagerDuty { service_key: String },
218 Webhook { url: String },
219 SMS { phone_numbers: Vec<String> },
220}
221
222#[derive(Debug, Clone, Serialize, Deserialize)]
224pub struct EscalationPolicy {
225 pub levels: Vec<EscalationLevel>,
227}
228
229impl Default for EscalationPolicy {
230 fn default() -> Self {
231 Self {
232 levels: vec![
233 EscalationLevel {
234 level: 1,
235 wait_minutes: 5,
236 channels: vec![NotificationChannel::Email {
237 recipients: vec!["ops@example.com".to_string()],
238 }],
239 },
240 EscalationLevel {
241 level: 2,
242 wait_minutes: 15,
243 channels: vec![NotificationChannel::Email {
244 recipients: vec!["manager@example.com".to_string()],
245 }],
246 },
247 ],
248 }
249 }
250}
251
252#[derive(Debug, Clone, Serialize, Deserialize)]
254pub struct EscalationLevel {
255 pub level: u32,
257 pub wait_minutes: u32,
259 pub channels: Vec<NotificationChannel>,
261}
262
263#[derive(Debug, Clone, Serialize, Deserialize)]
265pub struct AlertingConfig {
266 pub enabled: bool,
268 pub rules: Vec<AlertRule>,
270 pub aggregation_window_secs: u64,
272 pub deduplication_enabled: bool,
274}
275
276impl Default for AlertingConfig {
277 fn default() -> Self {
278 Self {
279 enabled: true,
280 rules: vec![],
281 aggregation_window_secs: 60,
282 deduplication_enabled: true,
283 }
284 }
285}
286
287#[derive(Debug, Clone, Serialize, Deserialize)]
289pub struct AlertRule {
290 pub id: String,
292 pub name: String,
294 pub condition: AlertCondition,
296 pub severity: AlertSeverity,
298 pub channels: Vec<NotificationChannel>,
300 pub enabled: bool,
302}
303
304#[derive(Debug, Clone, Serialize, Deserialize)]
306pub enum AlertCondition {
307 Threshold {
309 metric: String,
310 operator: ComparisonOperator,
311 value: f64,
312 duration_secs: u64,
313 },
314 Anomaly { metric: String, sensitivity: f64 },
316 RateOfChange {
318 metric: String,
319 threshold_percent: f64,
320 window_secs: u64,
321 },
322}
323
324#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
326pub enum ComparisonOperator {
327 GreaterThan,
328 LessThan,
329 Equals,
330 NotEquals,
331 GreaterThanOrEqual,
332 LessThanOrEqual,
333}
334
335#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
337pub enum AlertSeverity {
338 Info,
339 Warning,
340 Error,
341 Critical,
342}
343
344impl fmt::Display for AlertSeverity {
345 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
346 match self {
347 AlertSeverity::Info => write!(f, "INFO"),
348 AlertSeverity::Warning => write!(f, "WARNING"),
349 AlertSeverity::Error => write!(f, "ERROR"),
350 AlertSeverity::Critical => write!(f, "CRITICAL"),
351 }
352 }
353}
354
355#[derive(Debug, Clone, Serialize, Deserialize)]
357pub struct MetricsConfig {
358 pub enabled: bool,
360 pub collection_interval_secs: u64,
362 pub metrics: Vec<MetricDefinition>,
364 pub export: MetricsExportConfig,
366}
367
368impl Default for MetricsConfig {
369 fn default() -> Self {
370 Self {
371 enabled: true,
372 collection_interval_secs: 10,
373 metrics: vec![],
374 export: MetricsExportConfig::default(),
375 }
376 }
377}
378
379#[derive(Debug, Clone, Serialize, Deserialize)]
381pub struct MetricDefinition {
382 pub name: String,
384 pub metric_type: MetricType,
386 pub description: String,
388 pub labels: Vec<String>,
390}
391
392#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
394pub enum MetricType {
395 Counter,
396 Gauge,
397 Histogram,
398 Summary,
399}
400
401#[derive(Debug, Clone, Serialize, Deserialize)]
403pub struct MetricsExportConfig {
404 pub format: MetricsFormat,
406 pub endpoints: Vec<MetricsEndpoint>,
408}
409
410impl Default for MetricsExportConfig {
411 fn default() -> Self {
412 Self {
413 format: MetricsFormat::Prometheus,
414 endpoints: vec![],
415 }
416 }
417}
418
419#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
421pub enum MetricsFormat {
422 Prometheus,
423 OpenMetrics,
424 JSON,
425 StatsD,
426}
427
428#[derive(Debug, Clone, Serialize, Deserialize)]
430pub struct MetricsEndpoint {
431 pub endpoint_type: MetricsEndpointType,
433 pub url: String,
435 pub push_interval_secs: Option<u64>,
437}
438
439#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
441pub enum MetricsEndpointType {
442 Pull,
444 Push,
446}
447
448#[derive(Debug, Clone, Serialize, Deserialize)]
450pub struct HealthCheckConfig {
451 pub enabled: bool,
453 pub interval_secs: u64,
455 pub timeout_secs: u64,
457 pub endpoints: Vec<HealthCheckEndpoint>,
459}
460
461impl Default for HealthCheckConfig {
462 fn default() -> Self {
463 Self {
464 enabled: true,
465 interval_secs: 30,
466 timeout_secs: 5,
467 endpoints: vec![],
468 }
469 }
470}
471
472#[derive(Debug, Clone, Serialize, Deserialize)]
474pub struct HealthCheckEndpoint {
475 pub name: String,
477 pub check_type: HealthCheckType,
479 pub critical: bool,
481}
482
483#[derive(Debug, Clone, Serialize, Deserialize)]
485pub enum HealthCheckType {
486 TcpConnect { host: String, port: u16 },
488 Http { url: String, expected_status: u16 },
490 Database { connection_string: String },
492 Custom { command: String },
494}
495
496#[derive(Debug, Clone, Serialize, Deserialize)]
498pub struct ProfilingConfig {
499 pub enabled: bool,
501 pub cpu_profiling: bool,
503 pub memory_profiling: bool,
505 pub sampling_rate: u32,
507 pub duration_secs: u64,
509}
510
511impl Default for ProfilingConfig {
512 fn default() -> Self {
513 Self {
514 enabled: false,
515 cpu_profiling: true,
516 memory_profiling: true,
517 sampling_rate: 100,
518 duration_secs: 30,
519 }
520 }
521}
522
523pub struct EnterpriseMonitoringSystem {
525 config: EnterpriseMonitoringConfig,
526 sla_tracker: Arc<RwLock<SlaTracker>>,
527 alert_manager: Arc<RwLock<AlertManager>>,
528 metrics_collector: Arc<RwLock<MetricsCollector>>,
529}
530
531pub struct SlaTracker {
533 objectives: Vec<SlaObjective>,
534 measurements: HashMap<String, Vec<SlaMeasurement>>,
535 breaches: Vec<SlaBreach>,
536}
537
538#[derive(Debug, Clone, Serialize, Deserialize)]
540pub struct SlaMeasurement {
541 pub timestamp: DateTime<Utc>,
543 pub metric_type: SlaMetricType,
545 pub value: f64,
547 pub meets_objective: bool,
549}
550
551#[derive(Debug, Clone, Serialize, Deserialize)]
553pub struct SlaBreach {
554 pub breach_id: String,
556 pub objective_name: String,
558 pub metric_type: SlaMetricType,
560 pub target_value: f64,
562 pub actual_value: f64,
564 pub timestamp: DateTime<Utc>,
566 pub severity: SlaSeverity,
568 pub resolved: bool,
570 pub resolved_at: Option<DateTime<Utc>>,
572}
573
574impl SlaTracker {
575 pub fn new(objectives: Vec<SlaObjective>) -> Self {
576 Self {
577 objectives,
578 measurements: HashMap::new(),
579 breaches: Vec::new(),
580 }
581 }
582
583 pub fn record_measurement(&mut self, measurement: SlaMeasurement) {
584 let key = measurement.metric_type.to_string();
585 self.measurements.entry(key).or_default().push(measurement);
586 }
587
588 pub fn check_objectives(&mut self) -> Vec<SlaBreach> {
589 let mut new_breaches = Vec::new();
590
591 for objective in &self.objectives {
592 let key = objective.metric_type.to_string();
593 if let Some(measurements) = self.measurements.get(&key) {
594 if let Some(latest) = measurements.last() {
595 if !latest.meets_objective {
596 new_breaches.push(SlaBreach {
597 breach_id: uuid::Uuid::new_v4().to_string(),
598 objective_name: objective.name.clone(),
599 metric_type: objective.metric_type,
600 target_value: objective.target_value,
601 actual_value: latest.value,
602 timestamp: latest.timestamp,
603 severity: objective.severity,
604 resolved: false,
605 resolved_at: None,
606 });
607 }
608 }
609 }
610 }
611
612 self.breaches.extend(new_breaches.clone());
613 new_breaches
614 }
615}
616
617pub struct AlertManager {
619 rules: Vec<AlertRule>,
620 active_alerts: Vec<Alert>,
621}
622
623#[derive(Debug, Clone, Serialize, Deserialize)]
625pub struct Alert {
626 pub alert_id: String,
628 pub rule_id: String,
630 pub name: String,
632 pub severity: AlertSeverity,
634 pub triggered_at: DateTime<Utc>,
636 pub resolved: bool,
638 pub resolved_at: Option<DateTime<Utc>>,
640 pub details: HashMap<String, String>,
642}
643
644impl AlertManager {
645 pub fn new(rules: Vec<AlertRule>) -> Self {
646 Self {
647 rules,
648 active_alerts: Vec::new(),
649 }
650 }
651
652 pub fn evaluate_rules(&mut self, metrics: &HashMap<String, f64>) -> Vec<Alert> {
653 let mut new_alerts = Vec::new();
654
655 for rule in &self.rules {
656 if !rule.enabled {
657 continue;
658 }
659
660 if self.should_trigger_alert(rule, metrics) {
661 let alert = Alert {
662 alert_id: uuid::Uuid::new_v4().to_string(),
663 rule_id: rule.id.clone(),
664 name: rule.name.clone(),
665 severity: rule.severity,
666 triggered_at: Utc::now(),
667 resolved: false,
668 resolved_at: None,
669 details: HashMap::new(),
670 };
671 new_alerts.push(alert.clone());
672 self.active_alerts.push(alert);
673 }
674 }
675
676 new_alerts
677 }
678
679 fn should_trigger_alert(&self, _rule: &AlertRule, _metrics: &HashMap<String, f64>) -> bool {
680 false
682 }
683}
684
685pub struct MetricsCollector {
687 metrics: HashMap<String, Vec<MetricValue>>,
688}
689
690#[derive(Debug, Clone, Serialize, Deserialize)]
691pub struct MetricValue {
692 pub timestamp: DateTime<Utc>,
693 pub value: f64,
694 pub labels: HashMap<String, String>,
695}
696
697impl MetricsCollector {
698 pub fn new() -> Self {
699 Self {
700 metrics: HashMap::new(),
701 }
702 }
703
704 pub fn record_metric(&mut self, name: String, value: MetricValue) {
705 self.metrics.entry(name).or_default().push(value);
706 }
707
708 pub fn get_latest_values(&self) -> HashMap<String, f64> {
709 self.metrics
710 .iter()
711 .filter_map(|(name, values)| values.last().map(|v| (name.clone(), v.value)))
712 .collect()
713 }
714}
715
716impl Default for MetricsCollector {
717 fn default() -> Self {
718 Self::new()
719 }
720}
721
722impl EnterpriseMonitoringSystem {
723 pub fn new(config: EnterpriseMonitoringConfig) -> Self {
724 Self {
725 sla_tracker: Arc::new(RwLock::new(SlaTracker::new(config.sla.objectives.clone()))),
726 alert_manager: Arc::new(RwLock::new(AlertManager::new(
727 config.alerting.rules.clone(),
728 ))),
729 metrics_collector: Arc::new(RwLock::new(MetricsCollector::new())),
730 config,
731 }
732 }
733
734 pub async fn initialize(&self) -> Result<()> {
735 if !self.config.enabled {
736 info!("Enterprise monitoring is disabled");
737 return Ok(());
738 }
739
740 info!("Initializing enterprise monitoring system");
741 Ok(())
742 }
743
744 pub async fn record_sla_measurement(&self, measurement: SlaMeasurement) -> Result<()> {
745 let mut tracker = self.sla_tracker.write().await;
746 tracker.record_measurement(measurement);
747
748 let breaches = tracker.check_objectives();
749 if !breaches.is_empty() {
750 warn!("SLA breaches detected: {}", breaches.len());
751 for breach in &breaches {
752 error!(
753 "SLA breach: {} - {} (target: {}, actual: {})",
754 breach.objective_name,
755 breach.metric_type,
756 breach.target_value,
757 breach.actual_value
758 );
759 }
760 }
761
762 Ok(())
763 }
764
765 pub async fn get_sla_status(&self) -> Result<SlaStatus> {
766 let tracker = self.sla_tracker.read().await;
767 Ok(SlaStatus {
768 total_objectives: tracker.objectives.len() as u64,
769 objectives_met: 0, objectives_breached: tracker.breaches.len() as u64,
771 active_breaches: tracker.breaches.iter().filter(|b| !b.resolved).count() as u64,
772 })
773 }
774}
775
776#[derive(Debug, Clone, Serialize, Deserialize)]
777pub struct SlaStatus {
778 pub total_objectives: u64,
779 pub objectives_met: u64,
780 pub objectives_breached: u64,
781 pub active_breaches: u64,
782}
783
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration enables monitoring and SLA tracking.
    // These tests await nothing, so plain `#[test]` is enough; no async
    // runtime is needed.
    #[test]
    fn test_monitoring_config_default() {
        let config = EnterpriseMonitoringConfig::default();
        assert!(config.enabled);
        assert!(config.sla.enabled);
    }

    /// `SlaSeverity` orders from `Low` up to `Critical`.
    #[test]
    fn test_sla_severity_ordering() {
        assert!(SlaSeverity::Critical > SlaSeverity::High);
        assert!(SlaSeverity::High > SlaSeverity::Medium);
        assert!(SlaSeverity::Medium > SlaSeverity::Low);
    }

    /// `AlertSeverity` orders from `Info` up to `Critical`.
    #[test]
    fn test_alert_severity_ordering() {
        assert!(AlertSeverity::Critical > AlertSeverity::Error);
        assert!(AlertSeverity::Error > AlertSeverity::Warning);
        assert!(AlertSeverity::Warning > AlertSeverity::Info);
    }
}