1use chrono::{DateTime, Utc};
12use serde::{Deserialize, Serialize};
13use std::collections::HashMap;
14use std::time::Duration;
15
/// Severity level attached to an [`Alert`].
///
/// Variants are declared in ascending order, so the derived `Ord` yields
/// `Low < Medium < High < Critical`. Serde encodes each variant as its
/// lowercase name (`rename_all = "lowercase"`).
#[derive(
    Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Serialize, Deserialize,
)]
#[serde(rename_all = "lowercase")]
pub enum AlertSeverity {
    /// Lowest severity.
    Low,
    /// Middle severity; this is the value returned by `AlertSeverity::default()`.
    #[default]
    Medium,
    /// Above `Medium`, below `Critical`.
    High,
    /// Highest severity.
    Critical,
}
32
33impl std::fmt::Display for AlertSeverity {
34 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
35 match self {
36 AlertSeverity::Low => write!(f, "low"),
37 AlertSeverity::Medium => write!(f, "medium"),
38 AlertSeverity::High => write!(f, "high"),
39 AlertSeverity::Critical => write!(f, "critical"),
40 }
41 }
42}
43
/// Category of condition that an [`Alert`] reports.
///
/// Serde encodes variant names in `snake_case` (`rename_all = "snake_case"`).
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AlertType {
    /// Used by `Alert::timeout`.
    Timeout,
    /// Used by `Alert::cost_threshold`.
    CostThreshold,
    /// Used by `Alert::error_rate`.
    ErrorRate,
    /// Used by `Alert::latency`.
    Latency,
    /// No constructor in this file produces this variant.
    ResourceLimit,
    /// Caller-defined category; the `String` is the category name and is
    /// shown by `Display` as `custom:<name>`.
    Custom(String),
}
61
62impl std::fmt::Display for AlertType {
63 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
64 match self {
65 AlertType::Timeout => write!(f, "timeout"),
66 AlertType::CostThreshold => write!(f, "cost_threshold"),
67 AlertType::ErrorRate => write!(f, "error_rate"),
68 AlertType::Latency => write!(f, "latency"),
69 AlertType::ResourceLimit => write!(f, "resource_limit"),
70 AlertType::Custom(name) => write!(f, "custom:{}", name),
71 }
72 }
73}
74
/// A single alert raised for an agent.
///
/// Field names are serialized in `camelCase` (`rename_all = "camelCase"`).
/// Equality and hashing are implemented manually elsewhere in this file and
/// consider only `id`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Alert {
    /// Identifier of the alert; `Alert::new` fills it with a UUID v4 string.
    pub id: String,
    /// Category of the alert.
    pub alert_type: AlertType,
    /// How severe the condition is.
    pub severity: AlertSeverity,
    /// Identifier of the agent the alert refers to.
    pub agent_id: String,
    /// Human-readable description.
    pub message: String,
    /// Creation time; `Alert::new` sets it to `Utc::now()`.
    pub timestamp: DateTime<Utc>,
    /// `true` once `acknowledge` has been called; `is_active` is its negation.
    pub acknowledged: bool,
    /// Optional key/value context. The specialised constructors
    /// (`timeout`, `cost_threshold`, `error_rate`, `latency`) store the
    /// measured value and the threshold here.
    pub metadata: Option<HashMap<String, serde_json::Value>>,
}
96
97impl Alert {
98 pub fn new(
100 alert_type: AlertType,
101 severity: AlertSeverity,
102 agent_id: impl Into<String>,
103 message: impl Into<String>,
104 ) -> Self {
105 Self {
106 id: uuid::Uuid::new_v4().to_string(),
107 alert_type,
108 severity,
109 agent_id: agent_id.into(),
110 message: message.into(),
111 timestamp: Utc::now(),
112 acknowledged: false,
113 metadata: None,
114 }
115 }
116
117 pub fn timeout(agent_id: impl Into<String>, duration: Duration, timeout: Duration) -> Self {
119 let agent_id = agent_id.into();
120 let severity = if duration > timeout * 2 {
121 AlertSeverity::Critical
122 } else {
123 AlertSeverity::High
124 };
125
126 let mut alert = Self::new(
127 AlertType::Timeout,
128 severity,
129 agent_id.clone(),
130 format!(
131 "Agent {} exceeded timeout: {:?} > {:?}",
132 agent_id, duration, timeout
133 ),
134 );
135
136 let mut metadata = HashMap::new();
137 metadata.insert(
138 "duration_ms".to_string(),
139 serde_json::json!(duration.as_millis()),
140 );
141 metadata.insert(
142 "timeout_ms".to_string(),
143 serde_json::json!(timeout.as_millis()),
144 );
145 alert.metadata = Some(metadata);
146
147 alert
148 }
149
150 pub fn cost_threshold(agent_id: impl Into<String>, cost: f64, threshold: f64) -> Self {
152 let agent_id = agent_id.into();
153 let ratio = cost / threshold;
154 let severity = if ratio >= 2.0 {
155 AlertSeverity::Critical
156 } else if ratio >= 1.5 {
157 AlertSeverity::High
158 } else {
159 AlertSeverity::Medium
160 };
161
162 let mut alert = Self::new(
163 AlertType::CostThreshold,
164 severity,
165 agent_id.clone(),
166 format!(
167 "Agent {} exceeded cost threshold: ${:.4} > ${:.4}",
168 agent_id, cost, threshold
169 ),
170 );
171
172 let mut metadata = HashMap::new();
173 metadata.insert("cost".to_string(), serde_json::json!(cost));
174 metadata.insert("threshold".to_string(), serde_json::json!(threshold));
175 metadata.insert("ratio".to_string(), serde_json::json!(ratio));
176 alert.metadata = Some(metadata);
177
178 alert
179 }
180
181 pub fn error_rate(agent_id: impl Into<String>, error_rate: f32, threshold: f32) -> Self {
183 let agent_id = agent_id.into();
184 let severity = if error_rate >= 0.75 {
185 AlertSeverity::Critical
186 } else if error_rate >= 0.5 {
187 AlertSeverity::High
188 } else if error_rate >= threshold {
189 AlertSeverity::Medium
190 } else {
191 AlertSeverity::Low
192 };
193
194 let mut alert = Self::new(
195 AlertType::ErrorRate,
196 severity,
197 agent_id.clone(),
198 format!(
199 "Agent {} exceeded error rate threshold: {:.1}% > {:.1}%",
200 agent_id,
201 error_rate * 100.0,
202 threshold * 100.0
203 ),
204 );
205
206 let mut metadata = HashMap::new();
207 metadata.insert("error_rate".to_string(), serde_json::json!(error_rate));
208 metadata.insert("threshold".to_string(), serde_json::json!(threshold));
209 alert.metadata = Some(metadata);
210
211 alert
212 }
213
214 pub fn latency(agent_id: impl Into<String>, latency: Duration, threshold: Duration) -> Self {
216 let agent_id = agent_id.into();
217 let ratio = latency.as_millis() as f64 / threshold.as_millis() as f64;
218 let severity = if ratio >= 3.0 {
219 AlertSeverity::Critical
220 } else if ratio >= 2.0 {
221 AlertSeverity::High
222 } else {
223 AlertSeverity::Medium
224 };
225
226 let mut alert = Self::new(
227 AlertType::Latency,
228 severity,
229 agent_id.clone(),
230 format!(
231 "Agent {} exceeded latency threshold: {:?} > {:?}",
232 agent_id, latency, threshold
233 ),
234 );
235
236 let mut metadata = HashMap::new();
237 metadata.insert(
238 "latency_ms".to_string(),
239 serde_json::json!(latency.as_millis()),
240 );
241 metadata.insert(
242 "threshold_ms".to_string(),
243 serde_json::json!(threshold.as_millis()),
244 );
245 alert.metadata = Some(metadata);
246
247 alert
248 }
249
250 pub fn with_metadata(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
252 let metadata = self.metadata.get_or_insert_with(HashMap::new);
253 metadata.insert(key.into(), value);
254 self
255 }
256
257 pub fn acknowledge(&mut self) {
259 self.acknowledged = true;
260 }
261
262 pub fn is_active(&self) -> bool {
264 !self.acknowledged
265 }
266}
267
268impl PartialEq for Alert {
269 fn eq(&self, other: &Self) -> bool {
270 self.id == other.id
271 }
272}
273
// Marker impl: the manual `PartialEq` for `Alert` compares `id` strings, so
// equality is a full equivalence relation.
impl Eq for Alert {}
275
276impl std::hash::Hash for Alert {
277 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
278 self.id.hash(state);
279 }
280}
281
/// Lifecycle state of an agent execution.
///
/// Serde encodes variant names in `snake_case` (e.g. `timed_out`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AgentExecutionStatus {
    /// Default state; also the initial state set by `AgentMetrics::new`.
    #[default]
    Running,
    /// Execution completed.
    Completed,
    /// Execution failed.
    Failed,
    /// Execution was cancelled.
    Cancelled,
    /// Execution timed out.
    TimedOut,
}
298
/// Token counters for an agent execution.
///
/// `Default` yields all zeros. Field names are serialized in `camelCase`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct TokenUsage {
    /// Number of input tokens.
    pub input: usize,
    /// Number of output tokens.
    pub output: usize,
    /// Combined count; `TokenUsage::new` derives it from `input` and
    /// `output`. The field is public, so nothing keeps it in sync afterwards.
    pub total: usize,
}
310
311impl TokenUsage {
312 pub fn new(input: usize, output: usize) -> Self {
314 Self {
315 input,
316 output,
317 total: input + output,
318 }
319 }
320}
321
/// One error captured during an agent execution.
///
/// Field names are serialized in `camelCase`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ErrorRecord {
    /// Error description.
    pub message: String,
    /// When the record was created; `ErrorRecord::new` uses `Utc::now()`.
    pub timestamp: DateTime<Utc>,
    /// Optional phase label, set through `with_phase`.
    pub phase: Option<String>,
    /// Optional stack trace; no method in this file sets it, so callers
    /// assign the field directly.
    pub stack_trace: Option<String>,
}
335
336impl ErrorRecord {
337 pub fn new(message: impl Into<String>) -> Self {
339 Self {
340 message: message.into(),
341 timestamp: Utc::now(),
342 phase: None,
343 stack_trace: None,
344 }
345 }
346
347 pub fn with_phase(mut self, phase: impl Into<String>) -> Self {
349 self.phase = Some(phase.into());
350 self
351 }
352}
353
/// Metrics collected for one agent execution.
///
/// Field names are serialized in `camelCase`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct AgentMetrics {
    /// Identifier of the agent; copied into alerts raised for these metrics.
    pub agent_id: String,
    /// Caller-supplied type label; not interpreted anywhere in this file.
    pub agent_type: String,
    /// Start of the execution; `AgentMetrics::new` sets it to `Utc::now()`.
    pub start_time: DateTime<Utc>,
    /// End of the execution, if recorded.
    pub end_time: Option<DateTime<Utc>>,
    /// Recorded run duration. While `None`, timeout checks fall back to the
    /// wall-clock time elapsed since `start_time`.
    pub duration: Option<Duration>,
    /// Current lifecycle state.
    pub status: AgentExecutionStatus,
    /// Token counters.
    pub tokens_used: TokenUsage,
    /// Total number of API calls; the denominator of `error_rate()`.
    pub api_calls: usize,
    /// Number of API calls that succeeded; `error_rate()` treats the
    /// remainder of `api_calls` as failures.
    pub api_calls_successful: usize,
    /// Number of tool calls; not read by any code in this file.
    pub tool_calls_count: usize,
    /// Accumulated cost, compared against the cost threshold. Alert messages
    /// print it with a `$` prefix — presumably US dollars; confirm with the
    /// producer of this value.
    pub cost: f64,
    /// Errors recorded during the execution.
    pub errors: Vec<ErrorRecord>,
    /// Optional time limit; without it no timeout is ever reported.
    pub timeout: Option<Duration>,
}
385
386impl AgentMetrics {
387 pub fn new(agent_id: impl Into<String>, agent_type: impl Into<String>) -> Self {
389 Self {
390 agent_id: agent_id.into(),
391 agent_type: agent_type.into(),
392 start_time: Utc::now(),
393 end_time: None,
394 duration: None,
395 status: AgentExecutionStatus::Running,
396 tokens_used: TokenUsage::default(),
397 api_calls: 0,
398 api_calls_successful: 0,
399 tool_calls_count: 0,
400 cost: 0.0,
401 errors: Vec::new(),
402 timeout: None,
403 }
404 }
405
406 pub fn with_timeout(mut self, timeout: Duration) -> Self {
408 self.timeout = Some(timeout);
409 self
410 }
411
412 pub fn error_rate(&self) -> f32 {
414 if self.api_calls == 0 {
415 0.0
416 } else {
417 (self.api_calls - self.api_calls_successful) as f32 / self.api_calls as f32
418 }
419 }
420
421 pub fn is_timed_out(&self) -> bool {
423 if let Some(timeout) = self.timeout {
424 if let Some(duration) = self.duration {
425 return duration > timeout;
426 }
427 let elapsed = Utc::now().signed_duration_since(self.start_time);
429 if let Ok(elapsed_std) = elapsed.to_std() {
430 return elapsed_std > timeout;
431 }
432 }
433 false
434 }
435}
436
/// Limits used by [`AlertManager`] when evaluating metrics.
///
/// A `None` entry disables the corresponding check in
/// `AlertManager::check_all`. Field names are serialized in `camelCase`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct AlertThresholds {
    /// Cost above which a cost alert is raised (same unit as
    /// `AgentMetrics::cost`).
    pub cost_threshold: Option<f64>,
    /// Error rate, as a fraction, above which an error-rate alert is raised.
    pub error_rate_threshold: Option<f32>,
    /// Latency limit. NOTE(review): `AlertManager::check_all` in this file
    /// never reads this field — confirm whether a latency check is performed
    /// elsewhere.
    pub latency_threshold: Option<Duration>,
}
448
449impl Default for AlertThresholds {
450 fn default() -> Self {
451 Self {
452 cost_threshold: Some(1.0), error_rate_threshold: Some(0.1), latency_threshold: Some(Duration::from_secs(30)), }
456 }
457}
458
/// In-memory store of alerts together with the thresholds used to raise them.
#[derive(Debug)]
pub struct AlertManager {
    /// Stored alerts, keyed by `Alert::id`.
    alerts: HashMap<String, Alert>,
    /// Limits consulted by `check_all`.
    thresholds: AlertThresholds,
}
467
468impl Default for AlertManager {
469 fn default() -> Self {
470 Self::new()
471 }
472}
473
474impl AlertManager {
475 pub fn new() -> Self {
477 Self {
478 alerts: HashMap::new(),
479 thresholds: AlertThresholds::default(),
480 }
481 }
482
483 pub fn with_thresholds(thresholds: AlertThresholds) -> Self {
485 Self {
486 alerts: HashMap::new(),
487 thresholds,
488 }
489 }
490
491 pub fn add_alert(&mut self, alert: Alert) -> String {
493 let id = alert.id.clone();
494 self.alerts.insert(id.clone(), alert);
495 id
496 }
497
498 pub fn check_timeout(&mut self, metrics: &AgentMetrics) -> Option<Alert> {
500 if let Some(timeout) = metrics.timeout {
501 let duration = metrics.duration.unwrap_or_else(|| {
502 let elapsed = Utc::now().signed_duration_since(metrics.start_time);
503 elapsed.to_std().unwrap_or(Duration::ZERO)
504 });
505
506 if duration > timeout {
507 let alert = Alert::timeout(&metrics.agent_id, duration, timeout);
508 let id = alert.id.clone();
509 self.alerts.insert(id, alert.clone());
510 return Some(alert);
511 }
512 }
513 None
514 }
515
516 pub fn check_cost(&mut self, metrics: &AgentMetrics, threshold: f64) -> Option<Alert> {
518 if metrics.cost > threshold {
519 let alert = Alert::cost_threshold(&metrics.agent_id, metrics.cost, threshold);
520 let id = alert.id.clone();
521 self.alerts.insert(id, alert.clone());
522 return Some(alert);
523 }
524 None
525 }
526
527 pub fn check_errors(&mut self, metrics: &AgentMetrics, threshold: f32) -> Option<Alert> {
529 let error_rate = metrics.error_rate();
530 if error_rate > threshold {
531 let alert = Alert::error_rate(&metrics.agent_id, error_rate, threshold);
532 let id = alert.id.clone();
533 self.alerts.insert(id, alert.clone());
534 return Some(alert);
535 }
536 None
537 }
538
539 pub fn check_all(&mut self, metrics: &AgentMetrics) -> Vec<Alert> {
541 let mut alerts = Vec::new();
542
543 if let Some(alert) = self.check_timeout(metrics) {
544 alerts.push(alert);
545 }
546
547 if let Some(threshold) = self.thresholds.cost_threshold {
548 if let Some(alert) = self.check_cost(metrics, threshold) {
549 alerts.push(alert);
550 }
551 }
552
553 if let Some(threshold) = self.thresholds.error_rate_threshold {
554 if let Some(alert) = self.check_errors(metrics, threshold) {
555 alerts.push(alert);
556 }
557 }
558
559 alerts
560 }
561
562 pub fn get_alert(&self, alert_id: &str) -> Option<&Alert> {
564 self.alerts.get(alert_id)
565 }
566
567 pub fn get_alert_mut(&mut self, alert_id: &str) -> Option<&mut Alert> {
569 self.alerts.get_mut(alert_id)
570 }
571
572 pub fn get_active_alerts(&self) -> Vec<&Alert> {
574 self.alerts.values().filter(|a| a.is_active()).collect()
575 }
576
577 pub fn get_all_alerts(&self) -> Vec<&Alert> {
579 self.alerts.values().collect()
580 }
581
582 pub fn get_alerts_by_agent(&self, agent_id: &str) -> Vec<&Alert> {
584 self.alerts
585 .values()
586 .filter(|a| a.agent_id == agent_id)
587 .collect()
588 }
589
590 pub fn get_alerts_by_severity(&self, severity: AlertSeverity) -> Vec<&Alert> {
592 self.alerts
593 .values()
594 .filter(|a| a.severity == severity)
595 .collect()
596 }
597
598 pub fn get_alerts_by_type(&self, alert_type: &AlertType) -> Vec<&Alert> {
600 self.alerts
601 .values()
602 .filter(|a| &a.alert_type == alert_type)
603 .collect()
604 }
605
606 pub fn acknowledge(&mut self, alert_id: &str) -> bool {
608 if let Some(alert) = self.alerts.get_mut(alert_id) {
609 alert.acknowledge();
610 true
611 } else {
612 false
613 }
614 }
615
616 pub fn acknowledge_all(&mut self) {
618 for alert in self.alerts.values_mut() {
619 alert.acknowledge();
620 }
621 }
622
623 pub fn clear_acknowledged(&mut self) -> usize {
625 let before = self.alerts.len();
626 self.alerts.retain(|_, alert| !alert.acknowledged);
627 before - self.alerts.len()
628 }
629
630 pub fn clear_all(&mut self) {
632 self.alerts.clear();
633 }
634
635 pub fn alert_count(&self) -> usize {
637 self.alerts.len()
638 }
639
640 pub fn active_alert_count(&self) -> usize {
642 self.alerts.values().filter(|a| a.is_active()).count()
643 }
644
645 pub fn set_thresholds(&mut self, thresholds: AlertThresholds) {
647 self.thresholds = thresholds;
648 }
649
650 pub fn thresholds(&self) -> &AlertThresholds {
652 &self.thresholds
653 }
654}
655
// Unit tests for the alert types, the metrics helpers and `AlertManager`.
#[cfg(test)]
mod tests {
    use super::*;

    // The derived `Ord` follows declaration order.
    #[test]
    fn test_alert_severity_ordering() {
        assert!(AlertSeverity::Low < AlertSeverity::Medium);
        assert!(AlertSeverity::Medium < AlertSeverity::High);
        assert!(AlertSeverity::High < AlertSeverity::Critical);
    }

    // `Alert::new` copies its arguments and starts out unacknowledged.
    #[test]
    fn test_alert_creation() {
        let alert = Alert::new(
            AlertType::Timeout,
            AlertSeverity::High,
            "agent-1",
            "Test alert",
        );

        assert!(!alert.id.is_empty());
        assert_eq!(alert.alert_type, AlertType::Timeout);
        assert_eq!(alert.severity, AlertSeverity::High);
        assert_eq!(alert.agent_id, "agent-1");
        assert_eq!(alert.message, "Test alert");
        assert!(!alert.acknowledged);
        assert!(alert.is_active());
    }

    #[test]
    fn test_timeout_alert() {
        let alert = Alert::timeout("agent-1", Duration::from_secs(70), Duration::from_secs(30));

        assert_eq!(alert.alert_type, AlertType::Timeout);
        // 70s is more than twice the 30s timeout, hence Critical.
        assert_eq!(alert.severity, AlertSeverity::Critical);
        assert!(alert.message.contains("agent-1"));
        assert!(alert.metadata.is_some());
    }

    #[test]
    fn test_cost_threshold_alert() {
        let alert = Alert::cost_threshold("agent-1", 2.5, 1.0);

        assert_eq!(alert.alert_type, AlertType::CostThreshold);
        // Ratio 2.5 is at least 2.0, hence Critical.
        assert_eq!(alert.severity, AlertSeverity::Critical);
        // The cost is printed with four decimals ("$2.5000"), which contains "$2.5".
        assert!(alert.message.contains("$2.5"));
    }

    #[test]
    fn test_error_rate_alert() {
        let alert = Alert::error_rate("agent-1", 0.6, 0.1);

        assert_eq!(alert.alert_type, AlertType::ErrorRate);
        // 0.6 is at least 0.5 but below 0.75, hence High.
        assert_eq!(alert.severity, AlertSeverity::High);
        assert!(alert.message.contains("60.0%"));
    }

    // Acknowledging flips both `acknowledged` and `is_active`.
    #[test]
    fn test_alert_acknowledge() {
        let mut alert = Alert::new(AlertType::Timeout, AlertSeverity::High, "agent-1", "Test");

        assert!(alert.is_active());
        alert.acknowledge();
        assert!(!alert.is_active());
        assert!(alert.acknowledged);
    }

    // An added alert can be retrieved by the id that `add_alert` returns.
    #[test]
    fn test_alert_manager_basic() {
        let mut manager = AlertManager::new();

        let alert = Alert::new(AlertType::Timeout, AlertSeverity::High, "agent-1", "Test");
        let id = manager.add_alert(alert);

        assert_eq!(manager.alert_count(), 1);
        assert!(manager.get_alert(&id).is_some());
    }

    // A recorded duration above the timeout raises and stores an alert.
    #[test]
    fn test_alert_manager_check_timeout() {
        let mut manager = AlertManager::new();

        let mut metrics = AgentMetrics::new("agent-1", "test");
        metrics.timeout = Some(Duration::from_secs(10));
        metrics.duration = Some(Duration::from_secs(20));

        let alert = manager.check_timeout(&metrics);
        assert!(alert.is_some());
        assert_eq!(manager.alert_count(), 1);
    }

    // A cost of 2.0 against a threshold of 1.0 raises and stores an alert.
    #[test]
    fn test_alert_manager_check_cost() {
        let mut manager = AlertManager::new();

        let mut metrics = AgentMetrics::new("agent-1", "test");
        metrics.cost = 2.0;

        let alert = manager.check_cost(&metrics, 1.0);
        assert!(alert.is_some());
        assert_eq!(manager.alert_count(), 1);
    }

    #[test]
    fn test_alert_manager_check_errors() {
        let mut manager = AlertManager::new();

        let mut metrics = AgentMetrics::new("agent-1", "test");
        metrics.api_calls = 10;
        // 5 of 10 calls failed: a 50% error rate against a 10% threshold.
        metrics.api_calls_successful = 5;
        let alert = manager.check_errors(&metrics, 0.1);
        assert!(alert.is_some());
        assert_eq!(manager.alert_count(), 1);
    }

    // Acknowledging through the manager lowers the active count.
    #[test]
    fn test_alert_manager_acknowledge() {
        let mut manager = AlertManager::new();

        let alert = Alert::new(AlertType::Timeout, AlertSeverity::High, "agent-1", "Test");
        let id = manager.add_alert(alert);

        assert_eq!(manager.active_alert_count(), 1);
        assert!(manager.acknowledge(&id));
        assert_eq!(manager.active_alert_count(), 0);
    }

    // Only the acknowledged alert is removed; the other one stays stored.
    #[test]
    fn test_alert_manager_clear_acknowledged() {
        let mut manager = AlertManager::new();

        let alert1 = Alert::new(AlertType::Timeout, AlertSeverity::High, "agent-1", "Test 1");
        let alert2 = Alert::new(
            AlertType::CostThreshold,
            AlertSeverity::Medium,
            "agent-2",
            "Test 2",
        );

        let id1 = manager.add_alert(alert1);
        manager.add_alert(alert2);

        manager.acknowledge(&id1);
        let cleared = manager.clear_acknowledged();

        assert_eq!(cleared, 1);
        assert_eq!(manager.alert_count(), 1);
    }

    // After acknowledging the first alert only the second remains active.
    #[test]
    fn test_alert_manager_get_active_alerts() {
        let mut manager = AlertManager::new();

        let alert1 = Alert::new(AlertType::Timeout, AlertSeverity::High, "agent-1", "Test 1");
        let alert2 = Alert::new(
            AlertType::CostThreshold,
            AlertSeverity::Medium,
            "agent-2",
            "Test 2",
        );

        let id1 = manager.add_alert(alert1);
        manager.add_alert(alert2);

        manager.acknowledge(&id1);

        let active = manager.get_active_alerts();
        assert_eq!(active.len(), 1);
        assert_eq!(active[0].agent_id, "agent-2");
    }

    // Two High alerts and one Medium alert are filtered by exact severity.
    #[test]
    fn test_alert_manager_get_by_severity() {
        let mut manager = AlertManager::new();

        manager.add_alert(Alert::new(
            AlertType::Timeout,
            AlertSeverity::High,
            "agent-1",
            "Test 1",
        ));
        manager.add_alert(Alert::new(
            AlertType::CostThreshold,
            AlertSeverity::High,
            "agent-2",
            "Test 2",
        ));
        manager.add_alert(Alert::new(
            AlertType::ErrorRate,
            AlertSeverity::Medium,
            "agent-3",
            "Test 3",
        ));

        let high_alerts = manager.get_alerts_by_severity(AlertSeverity::High);
        assert_eq!(high_alerts.len(), 2);

        let medium_alerts = manager.get_alerts_by_severity(AlertSeverity::Medium);
        assert_eq!(medium_alerts.len(), 1);
    }

    // 2 failures out of 10 calls is a rate of 0.2 (compared with a tolerance
    // because the result is an `f32`).
    #[test]
    fn test_agent_metrics_error_rate() {
        let mut metrics = AgentMetrics::new("agent-1", "test");
        metrics.api_calls = 10;
        metrics.api_calls_successful = 8;

        assert!((metrics.error_rate() - 0.2).abs() < 0.001);
    }

    // With no API calls the rate is defined as 0.0 instead of dividing by zero.
    #[test]
    fn test_agent_metrics_error_rate_zero_calls() {
        let metrics = AgentMetrics::new("agent-1", "test");
        assert_eq!(metrics.error_rate(), 0.0);
    }
}