1use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::time::{Duration, SystemTime, UNIX_EPOCH};
7use thiserror::Error;
8
/// Errors produced by the monitoring subsystem.
///
/// I/O and JSON serialization errors convert automatically via `#[from]`;
/// the remaining variants carry a human-readable detail string.
#[derive(Debug, Error)]
pub enum MonitoringError {
    #[error("Metric collection error: {0}")]
    MetricCollection(String),
    #[error("Exporter error: {0}")]
    Exporter(String),
    #[error("Alert manager error: {0}")]
    AlertManager(String),
    #[error("Dashboard error: {0}")]
    Dashboard(String),
    #[error("Configuration error: {0}")]
    Configuration(String),
    #[error("Network error: {0}")]
    Network(String),
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    #[error("Serialization error: {0}")]
    Serialization(#[from] serde_json::Error),
}
28
/// The four Prometheus-style metric kinds; used when rendering `# TYPE` lines.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MetricType {
    Counter,
    Gauge,
    Histogram,
    Summary,
}
37
/// A single recorded sample of a metric.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricValue {
    /// Sample value.
    pub value: f64,
    /// Seconds since the Unix epoch at record time.
    pub timestamp: u64,
    /// Label key/value pairs attached to this sample.
    pub labels: HashMap<String, String>,
}
45
/// A named metric plus its retained samples.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Metric {
    pub name: String,
    pub metric_type: MetricType,
    /// Human-readable description, emitted on the `# HELP` line.
    pub description: String,
    /// Unit label (informational only; not rendered in the export).
    pub unit: String,
    /// Recorded samples; pruned against `retention` on each write.
    pub values: Vec<MetricValue>,
    /// How long samples are kept before being pruned.
    pub retention: Duration,
}
56
/// A threshold rule evaluated against the latest sample of a metric.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertRule {
    pub name: String,
    /// Name of the metric this rule watches.
    pub metric_name: String,
    pub condition: AlertCondition,
    pub threshold: f64,
    /// Intended "for" duration before firing.
    /// NOTE(review): evaluate_alerts currently fires on a single sample and
    /// does not consult this field — confirm intended semantics.
    pub duration: Duration,
    pub severity: AlertSeverity,
    /// Disabled rules are skipped during evaluation.
    pub enabled: bool,
    /// Labels copied onto alerts created from this rule.
    pub labels: HashMap<String, String>,
    pub annotations: HashMap<String, String>,
}
70
/// Comparison applied between a metric's latest value and a rule threshold.
/// `Equal`/`NotEqual` compare within `f64::EPSILON`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AlertCondition {
    GreaterThan,
    LessThan,
    Equal,
    NotEqual,
    GreaterThanOrEqual,
    LessThanOrEqual,
}
80
/// Severity level attached to alert rules and fired alerts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AlertSeverity {
    Critical,
    Warning,
    Info,
}
87
/// A fired (and possibly resolved) instance of an alert rule.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Alert {
    /// Name of the rule that fired.
    pub rule_name: String,
    pub metric_name: String,
    /// Metric value observed when the alert fired.
    pub current_value: f64,
    pub threshold: f64,
    pub severity: AlertSeverity,
    pub message: String,
    /// Unix seconds when the alert fired.
    pub started_at: u64,
    /// Unix seconds when the alert cleared; `None` while still firing.
    pub resolved_at: Option<u64>,
    pub labels: HashMap<String, String>,
}
101
/// A dashboard definition (Grafana-style): a set of panels plus display options.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dashboard {
    pub id: String,
    pub title: String,
    pub description: String,
    pub panels: Vec<DashboardPanel>,
    pub refresh_interval: Duration,
    pub time_range: TimeRange,
    /// Template variables, name -> value.
    pub variables: HashMap<String, String>,
}
113
/// A single visualization panel on a dashboard.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DashboardPanel {
    pub id: String,
    pub title: String,
    pub panel_type: PanelType,
    /// Query expressions driving this panel.
    pub metric_queries: Vec<String>,
    pub position: PanelPosition,
    /// Free-form panel options, passed through as JSON.
    pub options: HashMap<String, serde_json::Value>,
}
123
/// Supported panel visualization kinds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum PanelType {
    LineGraph,
    BarChart,
    Gauge,
    SingleStat,
    Table,
    HeatMap,
    Alert,
}
134
/// Grid placement of a panel (position and size in grid units).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PanelPosition {
    pub x: u32,
    pub y: u32,
    pub width: u32,
    pub height: u32,
}
142
/// Dashboard time window, expressed as strings (e.g. relative expressions
/// like "now-6h" — format is consumer-defined, not validated here).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimeRange {
    pub from: String,
    pub to: String,
}
148
/// Top-level monitoring configuration. A development-friendly default is
/// provided via `Default`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonitoringConfig {
    /// When false, `MonitoringSystem::initialize` is a no-op.
    pub enabled: bool,
    pub prometheus: PrometheusConfig,
    pub grafana: GrafanaConfig,
    pub alertmanager: AlertManagerConfig,
    pub custom_exporters: Vec<ExporterConfig>,
    /// Dashboards to load at initialization time.
    pub dashboards: Vec<Dashboard>,
    pub retention_policy: RetentionPolicy,
}
160
/// Prometheus connection and scrape settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PrometheusConfig {
    pub endpoint: String,
    pub scrape_interval: Duration,
    pub evaluation_interval: Duration,
    /// Labels added to all series exported from this instance.
    pub external_labels: HashMap<String, String>,
    /// Paths to alerting/recording rule files.
    pub rule_files: Vec<String>,
}
169
/// Grafana connection settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GrafanaConfig {
    pub endpoint: String,
    pub api_key: String,
    pub organization: String,
    /// Name of the datasource dashboards should query.
    pub datasource: String,
    /// When true, dashboards are provisioned during initialization.
    pub auto_provision: bool,
}
178
/// AlertManager connection, receivers, and routing configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertManagerConfig {
    pub endpoint: String,
    pub receivers: Vec<AlertReceiver>,
    pub routing: AlertRouting,
    pub inhibit_rules: Vec<InhibitRule>,
}
186
/// A named notification receiver; each channel list may hold multiple targets.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertReceiver {
    pub name: String,
    pub webhook_configs: Vec<WebhookConfig>,
    pub email_configs: Vec<EmailConfig>,
    pub slack_configs: Vec<SlackConfig>,
}
194
/// Webhook notification target.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebhookConfig {
    pub url: String,
    /// Optional auth/TLS settings for the HTTP call.
    pub http_config: Option<HttpConfig>,
}
200
/// Email notification target.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmailConfig {
    pub to: Vec<String>,
    pub from: String,
    pub subject: String,
    pub body: String,
}
208
/// Slack notification target.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SlackConfig {
    /// Slack webhook/API URL.
    pub api_url: String,
    pub channel: String,
    pub username: String,
    pub title: String,
    pub text: String,
}
217
/// HTTP client settings for outbound notification calls.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HttpConfig {
    pub basic_auth: Option<BasicAuth>,
    pub bearer_token: Option<String>,
    pub tls_config: Option<TlsConfig>,
}
224
/// HTTP basic-auth credentials.
/// NOTE(review): password is held in plain text and is Serialize/Debug —
/// consider redaction before logging or persisting.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BasicAuth {
    pub username: String,
    pub password: String,
}
230
/// TLS settings for outbound HTTP calls; file paths, all optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TlsConfig {
    pub ca_file: Option<String>,
    pub cert_file: Option<String>,
    pub key_file: Option<String>,
    /// Disables certificate verification — only for test environments.
    pub insecure_skip_verify: bool,
}
238
/// Top-level AlertManager-style routing tree.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertRouting {
    /// Labels used to group alerts into a single notification.
    pub group_by: Vec<String>,
    pub group_wait: Duration,
    pub group_interval: Duration,
    pub repeat_interval: Duration,
    /// Default receiver name when no child route matches.
    pub receiver: String,
    pub routes: Vec<Route>,
}
248
/// A child route: label matchers that direct alerts to a receiver.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Route {
    /// Label name -> required value for this route to match.
    pub matchers: HashMap<String, String>,
    pub receiver: String,
    pub group_by: Vec<String>,
    /// When true, evaluation continues to sibling routes after a match.
    pub continue_route: bool,
}
256
/// Suppresses target alerts while a matching source alert is firing,
/// provided the labels listed in `equal` have identical values on both.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InhibitRule {
    pub source_matchers: HashMap<String, String>,
    pub target_matchers: HashMap<String, String>,
    pub equal: Vec<String>,
}
263
/// A custom metrics exporter to scrape.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExporterConfig {
    pub name: String,
    pub endpoint: String,
    /// Scrape interval.
    pub interval: Duration,
    /// Per-scrape timeout.
    pub timeout: Duration,
    pub labels: HashMap<String, String>,
}
272
/// Sample retention policy: a default, per-metric overrides, and
/// downsampling for long-term storage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetentionPolicy {
    pub default_retention: Duration,
    /// Per-metric overrides keyed by metric name.
    pub metric_retentions: HashMap<String, Duration>,
    pub downsampling_rules: Vec<DownsamplingRule>,
}
279
/// Keep `retention` worth of samples aggregated down to `resolution` buckets.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DownsamplingRule {
    pub resolution: Duration,
    pub retention: Duration,
    pub aggregation: AggregationType,
}
286
/// Aggregation function used when downsampling.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AggregationType {
    Average,
    Sum,
    Min,
    Max,
    Count,
}
295
/// In-process monitoring engine: owns the metric store, alert rules,
/// currently-firing alerts, and loaded dashboards.
pub struct MonitoringSystem {
    config: MonitoringConfig,
    /// Registered metrics keyed by metric name.
    metrics: HashMap<String, Metric>,
    alert_rules: Vec<AlertRule>,
    /// Fired alerts; resolved ones remain with `resolved_at` set.
    active_alerts: Vec<Alert>,
    dashboards: Vec<Dashboard>,
}
304
305impl MonitoringSystem {
306 pub fn new(config: MonitoringConfig) -> Self {
308 Self {
309 config,
310 metrics: HashMap::new(),
311 alert_rules: Vec::new(),
312 active_alerts: Vec::new(),
313 dashboards: Vec::new(),
314 }
315 }
316
317 pub async fn initialize(&mut self) -> Result<(), MonitoringError> {
319 if !self.config.enabled {
320 return Ok(());
321 }
322
323 self.initialize_prometheus().await?;
325
326 self.initialize_grafana().await?;
328
329 self.initialize_alertmanager().await?;
331
332 self.setup_default_metrics().await?;
334
335 self.load_dashboards().await?;
337
338 Ok(())
339 }
340
341 async fn initialize_prometheus(&self) -> Result<(), MonitoringError> {
343 tracing::info!(
345 endpoint = %self.config.prometheus.endpoint,
346 "Initializing Prometheus"
347 );
348 Ok(())
349 }
350
351 async fn initialize_grafana(&self) -> Result<(), MonitoringError> {
353 if self.config.grafana.auto_provision {
354 self.provision_grafana_dashboards().await?;
356 }
357 Ok(())
358 }
359
360 async fn initialize_alertmanager(&self) -> Result<(), MonitoringError> {
362 tracing::info!(
364 endpoint = %self.config.alertmanager.endpoint,
365 "Initializing AlertManager"
366 );
367 Ok(())
368 }
369
370 async fn setup_default_metrics(&mut self) -> Result<(), MonitoringError> {
372 self.register_metric(Metric {
374 name: "auth_requests_total".to_string(),
375 metric_type: MetricType::Counter,
376 description: "Total number of authentication requests".to_string(),
377 unit: "requests".to_string(),
378 values: Vec::new(),
379 retention: Duration::from_secs(30 * 24 * 3600), });
381
382 self.register_metric(Metric {
383 name: "auth_success_total".to_string(),
384 metric_type: MetricType::Counter,
385 description: "Total number of successful authentications".to_string(),
386 unit: "requests".to_string(),
387 values: Vec::new(),
388 retention: Duration::from_secs(30 * 24 * 3600),
389 });
390
391 self.register_metric(Metric {
392 name: "auth_failures_total".to_string(),
393 metric_type: MetricType::Counter,
394 description: "Total number of failed authentications".to_string(),
395 unit: "requests".to_string(),
396 values: Vec::new(),
397 retention: Duration::from_secs(30 * 24 * 3600),
398 });
399
400 self.register_metric(Metric {
402 name: "authz_checks_total".to_string(),
403 metric_type: MetricType::Counter,
404 description: "Total number of authorization checks".to_string(),
405 unit: "checks".to_string(),
406 values: Vec::new(),
407 retention: Duration::from_secs(30 * 24 * 3600),
408 });
409
410 self.register_metric(Metric {
411 name: "authz_denied_total".to_string(),
412 metric_type: MetricType::Counter,
413 description: "Total number of denied authorization checks".to_string(),
414 unit: "checks".to_string(),
415 values: Vec::new(),
416 retention: Duration::from_secs(30 * 24 * 3600),
417 });
418
419 self.register_metric(Metric {
421 name: "request_duration_seconds".to_string(),
422 metric_type: MetricType::Histogram,
423 description: "Request duration in seconds".to_string(),
424 unit: "seconds".to_string(),
425 values: Vec::new(),
426 retention: Duration::from_secs(7 * 24 * 3600), });
428
429 self.register_metric(Metric {
430 name: "active_sessions".to_string(),
431 metric_type: MetricType::Gauge,
432 description: "Number of active user sessions".to_string(),
433 unit: "sessions".to_string(),
434 values: Vec::new(),
435 retention: Duration::from_secs(24 * 3600), });
437
438 Ok(())
439 }
440
441 pub fn register_metric(&mut self, metric: Metric) {
443 self.metrics.insert(metric.name.clone(), metric);
444 }
445
446 pub fn record_metric(
448 &mut self,
449 name: &str,
450 value: f64,
451 labels: HashMap<String, String>,
452 ) -> Result<(), MonitoringError> {
453 if let Some(metric) = self.metrics.get_mut(name) {
454 let now = SystemTime::now()
455 .duration_since(UNIX_EPOCH)
456 .unwrap_or_default();
457
458 let metric_value = MetricValue {
459 value,
460 timestamp: now.as_secs(),
461 labels,
462 };
463
464 metric.values.push(metric_value);
465
466 let cutoff = now.as_secs() - metric.retention.as_secs();
468 metric.values.retain(|v| v.timestamp > cutoff);
469
470 Ok(())
471 } else {
472 Err(MonitoringError::MetricCollection(format!(
473 "Metric not found: {}",
474 name
475 )))
476 }
477 }
478
479 pub fn add_alert_rule(&mut self, rule: AlertRule) {
481 self.alert_rules.push(rule);
482 }
483
484 pub async fn evaluate_alerts(&mut self) -> Result<(), MonitoringError> {
486 let now = SystemTime::now()
487 .duration_since(UNIX_EPOCH)
488 .unwrap_or_default();
489
490 for rule in &self.alert_rules {
491 if !rule.enabled {
492 continue;
493 }
494
495 if let Some(metric) = self.metrics.get(&rule.metric_name)
496 && let Some(latest_value) = metric.values.last()
497 {
498 let should_alert = match rule.condition {
499 AlertCondition::GreaterThan => latest_value.value > rule.threshold,
500 AlertCondition::LessThan => latest_value.value < rule.threshold,
501 AlertCondition::Equal => {
502 (latest_value.value - rule.threshold).abs() < f64::EPSILON
503 }
504 AlertCondition::NotEqual => {
505 (latest_value.value - rule.threshold).abs() > f64::EPSILON
506 }
507 AlertCondition::GreaterThanOrEqual => latest_value.value >= rule.threshold,
508 AlertCondition::LessThanOrEqual => latest_value.value <= rule.threshold,
509 };
510
511 if should_alert {
512 let existing_alert = self
514 .active_alerts
515 .iter()
516 .find(|alert| alert.rule_name == rule.name && alert.resolved_at.is_none());
517
518 if existing_alert.is_none() {
519 let alert = Alert {
520 rule_name: rule.name.clone(),
521 metric_name: rule.metric_name.clone(),
522 current_value: latest_value.value,
523 threshold: rule.threshold,
524 severity: rule.severity.clone(),
525 message: format!(
526 "Alert: {} - Current value: {}, Threshold: {}",
527 rule.name, latest_value.value, rule.threshold
528 ),
529 started_at: now.as_secs(),
530 resolved_at: None,
531 labels: rule.labels.clone(),
532 };
533
534 self.active_alerts.push(alert.clone());
535 self.send_alert(&alert).await?;
536 }
537 } else {
538 let mut alerts_to_resolve = Vec::new();
540 for alert in &mut self.active_alerts {
541 if alert.rule_name == rule.name && alert.resolved_at.is_none() {
542 alert.resolved_at = Some(now.as_secs());
543 alerts_to_resolve.push(alert.clone());
544 }
545 }
546
547 for alert in &alerts_to_resolve {
549 self.send_alert_resolution(alert).await?;
550 }
551 }
552 }
553 }
554 Ok(())
555 }
556
557 async fn send_alert(&self, alert: &Alert) -> Result<(), MonitoringError> {
559 tracing::warn!(rule = %alert.rule_name, message = %alert.message, "ALERT triggered");
561
562 for receiver in &self.config.alertmanager.receivers {
564 self.send_to_receiver(receiver, alert).await?;
565 }
566
567 Ok(())
568 }
569
570 async fn send_alert_resolution(&self, alert: &Alert) -> Result<(), MonitoringError> {
572 tracing::info!(rule = %alert.rule_name, message = %alert.message, "ALERT resolved");
573 Ok(())
574 }
575
576 async fn send_to_receiver(
578 &self,
579 receiver: &AlertReceiver,
580 alert: &Alert,
581 ) -> Result<(), MonitoringError> {
582 for webhook in &receiver.webhook_configs {
584 self.send_webhook_alert(webhook, alert).await?;
585 }
586
587 for email in &receiver.email_configs {
589 self.send_email_alert(email, alert).await?;
590 }
591
592 for slack in &receiver.slack_configs {
594 self.send_slack_alert(slack, alert).await?;
595 }
596
597 Ok(())
598 }
599
600 async fn send_webhook_alert(
602 &self,
603 _webhook: &WebhookConfig,
604 _alert: &Alert,
605 ) -> Result<(), MonitoringError> {
606 Ok(())
608 }
609
610 async fn send_email_alert(
612 &self,
613 _email: &EmailConfig,
614 _alert: &Alert,
615 ) -> Result<(), MonitoringError> {
616 Ok(())
618 }
619
620 async fn send_slack_alert(
622 &self,
623 _slack: &SlackConfig,
624 _alert: &Alert,
625 ) -> Result<(), MonitoringError> {
626 Ok(())
628 }
629
630 async fn load_dashboards(&mut self) -> Result<(), MonitoringError> {
632 self.dashboards = self.config.dashboards.clone();
633 Ok(())
634 }
635
636 async fn provision_grafana_dashboards(&self) -> Result<(), MonitoringError> {
638 for dashboard in &self.config.dashboards {
639 tracing::info!(title = %dashboard.title, "Provisioning Grafana dashboard");
640 }
641 Ok(())
642 }
643
644 pub fn export_prometheus_metrics(&self) -> String {
646 let mut output = String::new();
647
648 for (name, metric) in &self.metrics {
649 output.push_str(&format!("# HELP {} {}\n", name, metric.description));
651 output.push_str(&format!(
652 "# TYPE {} {}\n",
653 name,
654 match metric.metric_type {
655 MetricType::Counter => "counter",
656 MetricType::Gauge => "gauge",
657 MetricType::Histogram => "histogram",
658 MetricType::Summary => "summary",
659 }
660 ));
661
662 for value in &metric.values {
664 let labels = if value.labels.is_empty() {
665 String::new()
666 } else {
667 let label_pairs: Vec<String> = value
668 .labels
669 .iter()
670 .map(|(k, v)| format!("{}=\"{}\"", k, v))
671 .collect();
672 format!("{{{}}}", label_pairs.join(","))
673 };
674
675 output.push_str(&format!(
676 "{}{} {} {}\n",
677 name,
678 labels,
679 value.value,
680 value.timestamp * 1000
681 ));
682 }
683 }
684
685 output
686 }
687
688 pub fn get_metrics(&self) -> &HashMap<String, Metric> {
690 &self.metrics
691 }
692
693 pub fn get_active_alerts(&self) -> &Vec<Alert> {
695 &self.active_alerts
696 }
697
698 pub fn get_dashboards(&self) -> &Vec<Dashboard> {
700 &self.dashboards
701 }
702}
703
704impl Default for MonitoringConfig {
705 fn default() -> Self {
706 Self {
707 enabled: true,
708 prometheus: PrometheusConfig {
709 endpoint: "http://localhost:9090".to_string(),
710 scrape_interval: Duration::from_secs(15),
711 evaluation_interval: Duration::from_secs(15),
712 external_labels: HashMap::new(),
713 rule_files: vec!["alerts.yml".to_string()],
714 },
715 grafana: GrafanaConfig {
716 endpoint: "http://localhost:3000".to_string(),
717 api_key: "".to_string(),
718 organization: "Main Org.".to_string(),
719 datasource: "Prometheus".to_string(),
720 auto_provision: true,
721 },
722 alertmanager: AlertManagerConfig {
723 endpoint: "http://localhost:9093".to_string(),
724 receivers: vec![AlertReceiver {
725 name: "default".to_string(),
726 webhook_configs: Vec::new(),
727 email_configs: Vec::new(),
728 slack_configs: Vec::new(),
729 }],
730 routing: AlertRouting {
731 group_by: vec!["alertname".to_string()],
732 group_wait: Duration::from_secs(10),
733 group_interval: Duration::from_secs(10),
734 repeat_interval: Duration::from_secs(3600),
735 receiver: "default".to_string(),
736 routes: Vec::new(),
737 },
738 inhibit_rules: Vec::new(),
739 },
740 custom_exporters: Vec::new(),
741 dashboards: Vec::new(),
742 retention_policy: RetentionPolicy {
743 default_retention: Duration::from_secs(30 * 24 * 3600), metric_retentions: HashMap::new(),
745 downsampling_rules: Vec::new(),
746 },
747 }
748 }
749}
750
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh system starts with no metrics, rules, or alerts.
    #[test]
    fn test_monitoring_system_creation() {
        let config = MonitoringConfig::default();
        let system = MonitoringSystem::new(config);

        assert!(system.metrics.is_empty());
        assert!(system.alert_rules.is_empty());
        assert!(system.active_alerts.is_empty());
    }

    /// register_metric stores the metric under its own name.
    #[tokio::test]
    async fn test_metric_registration() {
        let config = MonitoringConfig::default();
        let mut system = MonitoringSystem::new(config);

        let metric = Metric {
            name: "test_metric".to_string(),
            metric_type: MetricType::Counter,
            description: "Test metric".to_string(),
            unit: "count".to_string(),
            values: Vec::new(),
            retention: Duration::from_secs(3600),
        };

        system.register_metric(metric);
        assert!(system.metrics.contains_key("test_metric"));
    }

    /// record_metric appends a sample (with labels) to a registered metric.
    #[tokio::test]
    async fn test_metric_recording() {
        let config = MonitoringConfig::default();
        let mut system = MonitoringSystem::new(config);

        let metric = Metric {
            name: "test_counter".to_string(),
            metric_type: MetricType::Counter,
            description: "Test counter".to_string(),
            unit: "count".to_string(),
            values: Vec::new(),
            retention: Duration::from_secs(3600),
        };

        system.register_metric(metric);

        let mut labels = HashMap::new();
        labels.insert("service".to_string(), "auth".to_string());

        let result = system.record_metric("test_counter", 1.0, labels);
        assert!(result.is_ok());

        let metric = system.metrics.get("test_counter").unwrap();
        assert_eq!(metric.values.len(), 1);
        assert_eq!(metric.values[0].value, 1.0);
    }

    /// The Prometheus export contains HELP/TYPE headers and sample lines.
    #[test]
    fn test_prometheus_export() {
        let config = MonitoringConfig::default();
        let mut system = MonitoringSystem::new(config);

        let metric = Metric {
            name: "test_gauge".to_string(),
            metric_type: MetricType::Gauge,
            description: "Test gauge".to_string(),
            unit: "value".to_string(),
            values: vec![MetricValue {
                value: 42.0,
                timestamp: 1640995200,
                labels: HashMap::new(),
            }],
            retention: Duration::from_secs(3600),
        };

        system.register_metric(metric);

        let export = system.export_prometheus_metrics();
        assert!(export.contains("# HELP test_gauge Test gauge"));
        assert!(export.contains("# TYPE test_gauge gauge"));
        assert!(export.contains("test_gauge 42"));
    }
}