1use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::time::{Duration, SystemTime, UNIX_EPOCH};
7use thiserror::Error;
8
9#[derive(Debug, Error)]
10pub enum MonitoringError {
11 #[error("Metric collection error: {0}")]
12 MetricCollection(String),
13 #[error("Exporter error: {0}")]
14 Exporter(String),
15 #[error("Alert manager error: {0}")]
16 AlertManager(String),
17 #[error("Dashboard error: {0}")]
18 Dashboard(String),
19 #[error("Configuration error: {0}")]
20 Configuration(String),
21 #[error("Network error: {0}")]
22 Network(String),
23 #[error("IO error: {0}")]
24 Io(#[from] std::io::Error),
25 #[error("Serialization error: {0}")]
26 Serialization(#[from] serde_json::Error),
27}
28
29#[derive(Debug, Clone, Serialize, Deserialize)]
31pub enum MetricType {
32 Counter,
33 Gauge,
34 Histogram,
35 Summary,
36}
37
38#[derive(Debug, Clone, Serialize, Deserialize)]
40pub struct MetricValue {
41 pub value: f64,
42 pub timestamp: u64,
43 pub labels: HashMap<String, String>,
44}
45
46#[derive(Debug, Clone, Serialize, Deserialize)]
48pub struct Metric {
49 pub name: String,
50 pub metric_type: MetricType,
51 pub description: String,
52 pub unit: String,
53 pub values: Vec<MetricValue>,
54 pub retention: Duration,
55}
56
57#[derive(Debug, Clone, Serialize, Deserialize)]
59pub struct AlertRule {
60 pub name: String,
61 pub metric_name: String,
62 pub condition: AlertCondition,
63 pub threshold: f64,
64 pub duration: Duration,
65 pub severity: AlertSeverity,
66 pub enabled: bool,
67 pub labels: HashMap<String, String>,
68 pub annotations: HashMap<String, String>,
69}
70
71#[derive(Debug, Clone, Serialize, Deserialize)]
72pub enum AlertCondition {
73 GreaterThan,
74 LessThan,
75 Equal,
76 NotEqual,
77 GreaterThanOrEqual,
78 LessThanOrEqual,
79}
80
81#[derive(Debug, Clone, Serialize, Deserialize)]
82pub enum AlertSeverity {
83 Critical,
84 Warning,
85 Info,
86}
87
88#[derive(Debug, Clone, Serialize, Deserialize)]
90pub struct Alert {
91 pub rule_name: String,
92 pub metric_name: String,
93 pub current_value: f64,
94 pub threshold: f64,
95 pub severity: AlertSeverity,
96 pub message: String,
97 pub started_at: u64,
98 pub resolved_at: Option<u64>,
99 pub labels: HashMap<String, String>,
100}
101
102#[derive(Debug, Clone, Serialize, Deserialize)]
104pub struct Dashboard {
105 pub id: String,
106 pub title: String,
107 pub description: String,
108 pub panels: Vec<DashboardPanel>,
109 pub refresh_interval: Duration,
110 pub time_range: TimeRange,
111 pub variables: HashMap<String, String>,
112}
113
114#[derive(Debug, Clone, Serialize, Deserialize)]
115pub struct DashboardPanel {
116 pub id: String,
117 pub title: String,
118 pub panel_type: PanelType,
119 pub metric_queries: Vec<String>,
120 pub position: PanelPosition,
121 pub options: HashMap<String, serde_json::Value>,
122}
123
124#[derive(Debug, Clone, Serialize, Deserialize)]
125pub enum PanelType {
126 LineGraph,
127 BarChart,
128 Gauge,
129 SingleStat,
130 Table,
131 HeatMap,
132 Alert,
133}
134
135#[derive(Debug, Clone, Serialize, Deserialize)]
136pub struct PanelPosition {
137 pub x: u32,
138 pub y: u32,
139 pub width: u32,
140 pub height: u32,
141}
142
143#[derive(Debug, Clone, Serialize, Deserialize)]
144pub struct TimeRange {
145 pub from: String,
146 pub to: String,
147}
148
149#[derive(Debug, Clone, Serialize, Deserialize)]
151pub struct MonitoringConfig {
152 pub enabled: bool,
153 pub prometheus: PrometheusConfig,
154 pub grafana: GrafanaConfig,
155 pub alertmanager: AlertManagerConfig,
156 pub custom_exporters: Vec<ExporterConfig>,
157 pub dashboards: Vec<Dashboard>,
158 pub retention_policy: RetentionPolicy,
159}
160
161#[derive(Debug, Clone, Serialize, Deserialize)]
162pub struct PrometheusConfig {
163 pub endpoint: String,
164 pub scrape_interval: Duration,
165 pub evaluation_interval: Duration,
166 pub external_labels: HashMap<String, String>,
167 pub rule_files: Vec<String>,
168}
169
170#[derive(Debug, Clone, Serialize, Deserialize)]
171pub struct GrafanaConfig {
172 pub endpoint: String,
173 pub api_key: String,
174 pub organization: String,
175 pub datasource: String,
176 pub auto_provision: bool,
177}
178
179#[derive(Debug, Clone, Serialize, Deserialize)]
180pub struct AlertManagerConfig {
181 pub endpoint: String,
182 pub receivers: Vec<AlertReceiver>,
183 pub routing: AlertRouting,
184 pub inhibit_rules: Vec<InhibitRule>,
185}
186
187#[derive(Debug, Clone, Serialize, Deserialize)]
188pub struct AlertReceiver {
189 pub name: String,
190 pub webhook_configs: Vec<WebhookConfig>,
191 pub email_configs: Vec<EmailConfig>,
192 pub slack_configs: Vec<SlackConfig>,
193}
194
195#[derive(Debug, Clone, Serialize, Deserialize)]
196pub struct WebhookConfig {
197 pub url: String,
198 pub http_config: Option<HttpConfig>,
199}
200
201#[derive(Debug, Clone, Serialize, Deserialize)]
202pub struct EmailConfig {
203 pub to: Vec<String>,
204 pub from: String,
205 pub subject: String,
206 pub body: String,
207}
208
209#[derive(Debug, Clone, Serialize, Deserialize)]
210pub struct SlackConfig {
211 pub api_url: String,
212 pub channel: String,
213 pub username: String,
214 pub title: String,
215 pub text: String,
216}
217
218#[derive(Debug, Clone, Serialize, Deserialize)]
219pub struct HttpConfig {
220 pub basic_auth: Option<BasicAuth>,
221 pub bearer_token: Option<String>,
222 pub tls_config: Option<TlsConfig>,
223}
224
225#[derive(Debug, Clone, Serialize, Deserialize)]
226pub struct BasicAuth {
227 pub username: String,
228 pub password: String,
229}
230
231#[derive(Debug, Clone, Serialize, Deserialize)]
232pub struct TlsConfig {
233 pub ca_file: Option<String>,
234 pub cert_file: Option<String>,
235 pub key_file: Option<String>,
236 pub insecure_skip_verify: bool,
237}
238
239#[derive(Debug, Clone, Serialize, Deserialize)]
240pub struct AlertRouting {
241 pub group_by: Vec<String>,
242 pub group_wait: Duration,
243 pub group_interval: Duration,
244 pub repeat_interval: Duration,
245 pub receiver: String,
246 pub routes: Vec<Route>,
247}
248
249#[derive(Debug, Clone, Serialize, Deserialize)]
250pub struct Route {
251 pub matchers: HashMap<String, String>,
252 pub receiver: String,
253 pub group_by: Vec<String>,
254 pub continue_route: bool,
255}
256
257#[derive(Debug, Clone, Serialize, Deserialize)]
258pub struct InhibitRule {
259 pub source_matchers: HashMap<String, String>,
260 pub target_matchers: HashMap<String, String>,
261 pub equal: Vec<String>,
262}
263
264#[derive(Debug, Clone, Serialize, Deserialize)]
265pub struct ExporterConfig {
266 pub name: String,
267 pub endpoint: String,
268 pub interval: Duration,
269 pub timeout: Duration,
270 pub labels: HashMap<String, String>,
271}
272
273#[derive(Debug, Clone, Serialize, Deserialize)]
274pub struct RetentionPolicy {
275 pub default_retention: Duration,
276 pub metric_retentions: HashMap<String, Duration>,
277 pub downsampling_rules: Vec<DownsamplingRule>,
278}
279
280#[derive(Debug, Clone, Serialize, Deserialize)]
281pub struct DownsamplingRule {
282 pub resolution: Duration,
283 pub retention: Duration,
284 pub aggregation: AggregationType,
285}
286
287#[derive(Debug, Clone, Serialize, Deserialize)]
288pub enum AggregationType {
289 Average,
290 Sum,
291 Min,
292 Max,
293 Count,
294}
295
296pub struct MonitoringSystem {
298 config: MonitoringConfig,
299 metrics: HashMap<String, Metric>,
300 alert_rules: Vec<AlertRule>,
301 active_alerts: Vec<Alert>,
302 dashboards: Vec<Dashboard>,
303}
304
305impl MonitoringSystem {
306 pub fn new(config: MonitoringConfig) -> Self {
308 Self {
309 config,
310 metrics: HashMap::new(),
311 alert_rules: Vec::new(),
312 active_alerts: Vec::new(),
313 dashboards: Vec::new(),
314 }
315 }
316
317 pub async fn initialize(&mut self) -> Result<(), MonitoringError> {
319 if !self.config.enabled {
320 return Ok(());
321 }
322
323 self.initialize_prometheus().await?;
325
326 self.initialize_grafana().await?;
328
329 self.initialize_alertmanager().await?;
331
332 self.setup_default_metrics().await?;
334
335 self.load_dashboards().await?;
337
338 Ok(())
339 }
340
341 async fn initialize_prometheus(&self) -> Result<(), MonitoringError> {
343 println!(
345 "Initializing Prometheus at {}",
346 self.config.prometheus.endpoint
347 );
348 Ok(())
349 }
350
351 async fn initialize_grafana(&self) -> Result<(), MonitoringError> {
353 if self.config.grafana.auto_provision {
354 self.provision_grafana_dashboards().await?;
356 }
357 Ok(())
358 }
359
360 async fn initialize_alertmanager(&self) -> Result<(), MonitoringError> {
362 println!(
364 "Initializing AlertManager at {}",
365 self.config.alertmanager.endpoint
366 );
367 Ok(())
368 }
369
370 async fn setup_default_metrics(&mut self) -> Result<(), MonitoringError> {
372 self.register_metric(Metric {
374 name: "auth_requests_total".to_string(),
375 metric_type: MetricType::Counter,
376 description: "Total number of authentication requests".to_string(),
377 unit: "requests".to_string(),
378 values: Vec::new(),
379 retention: Duration::from_secs(30 * 24 * 3600), });
381
382 self.register_metric(Metric {
383 name: "auth_success_total".to_string(),
384 metric_type: MetricType::Counter,
385 description: "Total number of successful authentications".to_string(),
386 unit: "requests".to_string(),
387 values: Vec::new(),
388 retention: Duration::from_secs(30 * 24 * 3600),
389 });
390
391 self.register_metric(Metric {
392 name: "auth_failures_total".to_string(),
393 metric_type: MetricType::Counter,
394 description: "Total number of failed authentications".to_string(),
395 unit: "requests".to_string(),
396 values: Vec::new(),
397 retention: Duration::from_secs(30 * 24 * 3600),
398 });
399
400 self.register_metric(Metric {
402 name: "authz_checks_total".to_string(),
403 metric_type: MetricType::Counter,
404 description: "Total number of authorization checks".to_string(),
405 unit: "checks".to_string(),
406 values: Vec::new(),
407 retention: Duration::from_secs(30 * 24 * 3600),
408 });
409
410 self.register_metric(Metric {
411 name: "authz_denied_total".to_string(),
412 metric_type: MetricType::Counter,
413 description: "Total number of denied authorization checks".to_string(),
414 unit: "checks".to_string(),
415 values: Vec::new(),
416 retention: Duration::from_secs(30 * 24 * 3600),
417 });
418
419 self.register_metric(Metric {
421 name: "request_duration_seconds".to_string(),
422 metric_type: MetricType::Histogram,
423 description: "Request duration in seconds".to_string(),
424 unit: "seconds".to_string(),
425 values: Vec::new(),
426 retention: Duration::from_secs(7 * 24 * 3600), });
428
429 self.register_metric(Metric {
430 name: "active_sessions".to_string(),
431 metric_type: MetricType::Gauge,
432 description: "Number of active user sessions".to_string(),
433 unit: "sessions".to_string(),
434 values: Vec::new(),
435 retention: Duration::from_secs(24 * 3600), });
437
438 Ok(())
439 }
440
441 pub fn register_metric(&mut self, metric: Metric) {
443 self.metrics.insert(metric.name.clone(), metric);
444 }
445
446 pub fn record_metric(
448 &mut self,
449 name: &str,
450 value: f64,
451 labels: HashMap<String, String>,
452 ) -> Result<(), MonitoringError> {
453 if let Some(metric) = self.metrics.get_mut(name) {
454 let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
455
456 let metric_value = MetricValue {
457 value,
458 timestamp: now.as_secs(),
459 labels,
460 };
461
462 metric.values.push(metric_value);
463
464 let cutoff = now.as_secs() - metric.retention.as_secs();
466 metric.values.retain(|v| v.timestamp > cutoff);
467
468 Ok(())
469 } else {
470 Err(MonitoringError::MetricCollection(format!(
471 "Metric not found: {}",
472 name
473 )))
474 }
475 }
476
477 pub fn add_alert_rule(&mut self, rule: AlertRule) {
479 self.alert_rules.push(rule);
480 }
481
482 pub async fn evaluate_alerts(&mut self) -> Result<(), MonitoringError> {
484 let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
485
486 for rule in &self.alert_rules {
487 if !rule.enabled {
488 continue;
489 }
490
491 if let Some(metric) = self.metrics.get(&rule.metric_name)
492 && let Some(latest_value) = metric.values.last()
493 {
494 let should_alert = match rule.condition {
495 AlertCondition::GreaterThan => latest_value.value > rule.threshold,
496 AlertCondition::LessThan => latest_value.value < rule.threshold,
497 AlertCondition::Equal => {
498 (latest_value.value - rule.threshold).abs() < f64::EPSILON
499 }
500 AlertCondition::NotEqual => {
501 (latest_value.value - rule.threshold).abs() > f64::EPSILON
502 }
503 AlertCondition::GreaterThanOrEqual => latest_value.value >= rule.threshold,
504 AlertCondition::LessThanOrEqual => latest_value.value <= rule.threshold,
505 };
506
507 if should_alert {
508 let existing_alert = self
510 .active_alerts
511 .iter()
512 .find(|alert| alert.rule_name == rule.name && alert.resolved_at.is_none());
513
514 if existing_alert.is_none() {
515 let alert = Alert {
516 rule_name: rule.name.clone(),
517 metric_name: rule.metric_name.clone(),
518 current_value: latest_value.value,
519 threshold: rule.threshold,
520 severity: rule.severity.clone(),
521 message: format!(
522 "Alert: {} - Current value: {}, Threshold: {}",
523 rule.name, latest_value.value, rule.threshold
524 ),
525 started_at: now.as_secs(),
526 resolved_at: None,
527 labels: rule.labels.clone(),
528 };
529
530 self.active_alerts.push(alert.clone());
531 self.send_alert(&alert).await?;
532 }
533 } else {
534 let mut alerts_to_resolve = Vec::new();
536 for alert in &mut self.active_alerts {
537 if alert.rule_name == rule.name && alert.resolved_at.is_none() {
538 alert.resolved_at = Some(now.as_secs());
539 alerts_to_resolve.push(alert.clone());
540 }
541 }
542
543 for alert in &alerts_to_resolve {
545 self.send_alert_resolution(alert).await?;
546 }
547 }
548 }
549 }
550 Ok(())
551 }
552
553 async fn send_alert(&self, alert: &Alert) -> Result<(), MonitoringError> {
555 println!("ALERT: {} - {}", alert.rule_name, alert.message);
557
558 for receiver in &self.config.alertmanager.receivers {
560 self.send_to_receiver(receiver, alert).await?;
561 }
562
563 Ok(())
564 }
565
566 async fn send_alert_resolution(&self, alert: &Alert) -> Result<(), MonitoringError> {
568 println!("ALERT RESOLVED: {} - {}", alert.rule_name, alert.message);
569 Ok(())
570 }
571
572 async fn send_to_receiver(
574 &self,
575 receiver: &AlertReceiver,
576 alert: &Alert,
577 ) -> Result<(), MonitoringError> {
578 for webhook in &receiver.webhook_configs {
580 self.send_webhook_alert(webhook, alert).await?;
581 }
582
583 for email in &receiver.email_configs {
585 self.send_email_alert(email, alert).await?;
586 }
587
588 for slack in &receiver.slack_configs {
590 self.send_slack_alert(slack, alert).await?;
591 }
592
593 Ok(())
594 }
595
596 async fn send_webhook_alert(
598 &self,
599 _webhook: &WebhookConfig,
600 _alert: &Alert,
601 ) -> Result<(), MonitoringError> {
602 Ok(())
604 }
605
606 async fn send_email_alert(
608 &self,
609 _email: &EmailConfig,
610 _alert: &Alert,
611 ) -> Result<(), MonitoringError> {
612 Ok(())
614 }
615
616 async fn send_slack_alert(
618 &self,
619 _slack: &SlackConfig,
620 _alert: &Alert,
621 ) -> Result<(), MonitoringError> {
622 Ok(())
624 }
625
626 async fn load_dashboards(&mut self) -> Result<(), MonitoringError> {
628 self.dashboards = self.config.dashboards.clone();
629 Ok(())
630 }
631
632 async fn provision_grafana_dashboards(&self) -> Result<(), MonitoringError> {
634 for dashboard in &self.config.dashboards {
635 println!("Provisioning Grafana dashboard: {}", dashboard.title);
636 }
637 Ok(())
638 }
639
640 pub fn export_prometheus_metrics(&self) -> String {
642 let mut output = String::new();
643
644 for (name, metric) in &self.metrics {
645 output.push_str(&format!("# HELP {} {}\n", name, metric.description));
647 output.push_str(&format!(
648 "# TYPE {} {}\n",
649 name,
650 match metric.metric_type {
651 MetricType::Counter => "counter",
652 MetricType::Gauge => "gauge",
653 MetricType::Histogram => "histogram",
654 MetricType::Summary => "summary",
655 }
656 ));
657
658 for value in &metric.values {
660 let labels = if value.labels.is_empty() {
661 String::new()
662 } else {
663 let label_pairs: Vec<String> = value
664 .labels
665 .iter()
666 .map(|(k, v)| format!("{}=\"{}\"", k, v))
667 .collect();
668 format!("{{{}}}", label_pairs.join(","))
669 };
670
671 output.push_str(&format!(
672 "{}{} {} {}\n",
673 name,
674 labels,
675 value.value,
676 value.timestamp * 1000
677 ));
678 }
679 }
680
681 output
682 }
683
684 pub fn get_metrics(&self) -> &HashMap<String, Metric> {
686 &self.metrics
687 }
688
689 pub fn get_active_alerts(&self) -> &Vec<Alert> {
691 &self.active_alerts
692 }
693
694 pub fn get_dashboards(&self) -> &Vec<Dashboard> {
696 &self.dashboards
697 }
698}
699
700impl Default for MonitoringConfig {
701 fn default() -> Self {
702 Self {
703 enabled: true,
704 prometheus: PrometheusConfig {
705 endpoint: "http://localhost:9090".to_string(),
706 scrape_interval: Duration::from_secs(15),
707 evaluation_interval: Duration::from_secs(15),
708 external_labels: HashMap::new(),
709 rule_files: vec!["alerts.yml".to_string()],
710 },
711 grafana: GrafanaConfig {
712 endpoint: "http://localhost:3000".to_string(),
713 api_key: "".to_string(),
714 organization: "Main Org.".to_string(),
715 datasource: "Prometheus".to_string(),
716 auto_provision: true,
717 },
718 alertmanager: AlertManagerConfig {
719 endpoint: "http://localhost:9093".to_string(),
720 receivers: vec![AlertReceiver {
721 name: "default".to_string(),
722 webhook_configs: Vec::new(),
723 email_configs: Vec::new(),
724 slack_configs: Vec::new(),
725 }],
726 routing: AlertRouting {
727 group_by: vec!["alertname".to_string()],
728 group_wait: Duration::from_secs(10),
729 group_interval: Duration::from_secs(10),
730 repeat_interval: Duration::from_secs(3600),
731 receiver: "default".to_string(),
732 routes: Vec::new(),
733 },
734 inhibit_rules: Vec::new(),
735 },
736 custom_exporters: Vec::new(),
737 dashboards: Vec::new(),
738 retention_policy: RetentionPolicy {
739 default_retention: Duration::from_secs(30 * 24 * 3600), metric_retentions: HashMap::new(),
741 downsampling_rules: Vec::new(),
742 },
743 }
744 }
745}
746
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a system from the default configuration.
    fn default_system() -> MonitoringSystem {
        MonitoringSystem::new(MonitoringConfig::default())
    }

    /// Builds a metric with a one-hour retention and the given samples.
    fn hourly_metric(
        name: &str,
        metric_type: MetricType,
        description: &str,
        unit: &str,
        values: Vec<MetricValue>,
    ) -> Metric {
        Metric {
            name: name.to_string(),
            metric_type,
            description: description.to_string(),
            unit: unit.to_string(),
            values,
            retention: Duration::from_secs(3600),
        }
    }

    /// A freshly constructed system holds no metrics, rules or alerts.
    #[test]
    fn test_monitoring_system_creation() {
        let system = default_system();

        assert!(system.metrics.is_empty());
        assert!(system.alert_rules.is_empty());
        assert!(system.active_alerts.is_empty());
    }

    /// A registered metric can be found under its name.
    #[tokio::test]
    async fn test_metric_registration() {
        let mut system = default_system();

        system.register_metric(hourly_metric(
            "test_metric",
            MetricType::Counter,
            "Test metric",
            "count",
            Vec::new(),
        ));

        assert!(system.metrics.contains_key("test_metric"));
    }

    /// Recording against a registered metric stores exactly one sample.
    #[tokio::test]
    async fn test_metric_recording() {
        let mut system = default_system();

        system.register_metric(hourly_metric(
            "test_counter",
            MetricType::Counter,
            "Test counter",
            "count",
            Vec::new(),
        ));

        let labels = HashMap::from([("service".to_string(), "auth".to_string())]);

        let outcome = system.record_metric("test_counter", 1.0, labels);
        assert!(outcome.is_ok());

        let recorded = system.metrics.get("test_counter").unwrap();
        assert_eq!(recorded.values.len(), 1);
        assert_eq!(recorded.values[0].value, 1.0);
    }

    /// The export contains the HELP, TYPE and sample lines of a gauge.
    #[test]
    fn test_prometheus_export() {
        let mut system = default_system();

        let sample = MetricValue {
            value: 42.0,
            timestamp: 1640995200,
            labels: HashMap::new(),
        };
        system.register_metric(hourly_metric(
            "test_gauge",
            MetricType::Gauge,
            "Test gauge",
            "value",
            vec![sample],
        ));

        let export = system.export_prometheus_metrics();
        assert!(export.contains("# HELP test_gauge Test gauge"));
        assert!(export.contains("# TYPE test_gauge gauge"));
        assert!(export.contains("test_gauge 42"));
    }
}
832
833