use super::{GpuMetrics, MemoryMetrics, PerformanceMetrics, SynthesisMetrics, SystemMetrics};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use tokio::sync::{mpsc, watch, RwLock};

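/// Configuration for the performance monitor: sampling interval, alert
/// thresholds, notification settings, monitored targets, metrics retention,
/// and automatic recovery behavior.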
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonitorConfig {
    pub interval: Duration,
    pub enabled: bool,
    pub thresholds: AlertThresholds,
    pub alerts: AlertConfig,
    pub targets: Vec<MonitorTarget>,
    pub retention_duration: Duration,
    pub auto_recovery: AutoRecoveryConfig,
}

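/// Threshold values that trigger alerts; a `None` disables the corresponding check.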
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertThresholds {
    pub cpu_usage_percent: Option<f64>,
    pub memory_usage_percent: Option<f64>,
    pub gpu_utilization_percent: Option<f64>,
    pub gpu_memory_percent: Option<f64>,
    pub min_real_time_factor: Option<f64>,
    pub max_synthesis_time_ms: Option<f64>,
    pub max_queue_depth: Option<usize>,
    pub min_success_rate_percent: Option<f64>,
    pub max_error_rate_percent: Option<f64>,
    pub max_gpu_temperature: Option<f64>,
    pub max_disk_usage_percent: Option<f64>,
    pub max_network_bps: Option<u64>,
}

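/// Where and how alerts are delivered, plus cooldown and hourly rate limiting.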
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertConfig {
    pub console_logging: bool,
    pub file_logging: Option<std::path::PathBuf>,
    pub email_notifications: Option<EmailConfig>,
    pub webhook_notifications: Option<WebhookConfig>,
    pub cooldown_duration: Duration,
    pub max_alerts_per_hour: usize,
}

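/// SMTP settings for email notifications.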
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmailConfig {
    pub smtp_server: String,
    pub smtp_port: u16,
    pub username: String,
    pub password: String,
    pub from_email: String,
    pub to_emails: Vec<String>,
    pub use_tls: bool,
}

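/// HTTP endpoint settings for webhook notifications.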
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebhookConfig {
    pub url: String,
    pub method: String,
    pub headers: HashMap<String, String>,
    pub timeout_seconds: u64,
    pub retry_attempts: usize,
}

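/// A single monitored metric source, optionally with its own thresholds and severity.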
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonitorTarget {
    pub name: String,
    pub target_type: MonitorTargetType,
    pub enabled: bool,
    pub custom_thresholds: Option<AlertThresholds>,
    pub severity: AlertSeverity,
}

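/// The kind of metric a target monitors.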
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MonitorTargetType {
    SystemCpu,
    SystemMemory,
    Gpu,
    SynthesisPerformance,
    QueueDepth,
    ErrorRate,
    DiskIo,
    NetworkIo,
    Custom(String),
}

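/// Alert severity, ordered from least to most severe.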
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AlertSeverity {
    Info,
    Warning,
    Critical,
    Emergency,
}

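/// Configuration for automatic recovery attempts made in response to alerts.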
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoRecoveryConfig {
    pub enabled: bool,
    pub actions: Vec<RecoveryAction>,
    pub max_attempts_per_hour: usize,
    pub retry_delay: Duration,
}

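/// Recovery actions the monitor can attempt when auto-recovery is enabled.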
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RecoveryAction {
    ReduceBatchSize { min_size: usize },
    ClearCaches,
    RestartWorkers,
    ReduceParallelism { min_threads: usize },
    EnableMemoryOptimization,
    ReduceQuality,
    PauseProcessing { duration: Duration },
    CustomCommand { command: String, args: Vec<String> },
}

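/// A single alert raised when a monitored metric crosses its threshold.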
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceAlert {
    pub id: String,
    pub timestamp: u64,
    pub severity: AlertSeverity,
    pub target: MonitorTargetType,
    pub message: String,
    pub current_value: f64,
    pub threshold_value: f64,
    pub metric_name: String,
    pub context: HashMap<String, String>,
    pub resolved: bool,
    pub resolved_at: Option<u64>,
}

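/// Tracks active alerts and alert history, applies per-metric cooldowns and an
/// hourly rate limit, and dispatches notifications.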
#[derive(Debug)]
pub struct AlertManager {
    config: AlertConfig,
    active_alerts: Arc<RwLock<HashMap<String, PerformanceAlert>>>,
    alert_history: Arc<RwLock<Vec<PerformanceAlert>>>,
    alert_cooldowns: Arc<RwLock<HashMap<String, Instant>>>,
    hourly_alert_count: Arc<RwLock<usize>>,
    last_hour_reset: Arc<RwLock<Instant>>,
}

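/// Real-time performance monitor.
///
/// Typical lifecycle: create with `PerformanceMonitor::new`, call `start`,
/// push fresh metrics through `update_metrics` (threshold checks run on each
/// update), and call `stop` on shutdown.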
pub struct PerformanceMonitor {
    config: MonitorConfig,
    alert_manager: AlertManager,
    current_metrics: Arc<RwLock<Option<PerformanceMetrics>>>,
    is_running: Arc<RwLock<bool>>,
    metrics_history: Arc<RwLock<Vec<PerformanceMetrics>>>,
    alert_sender: mpsc::UnboundedSender<PerformanceAlert>,
    alert_receiver: Arc<RwLock<Option<mpsc::UnboundedReceiver<PerformanceAlert>>>>,
    shutdown_sender: Option<watch::Sender<bool>>,
}

impl PerformanceMonitor {
    pub fn new(config: MonitorConfig) -> Self {
        let (alert_sender, alert_receiver) = mpsc::unbounded_channel();
        let (shutdown_sender, _) = watch::channel(false);

        Self {
            alert_manager: AlertManager::new(config.alerts.clone()),
            config,
            current_metrics: Arc::new(RwLock::new(None)),
            is_running: Arc::new(RwLock::new(false)),
            metrics_history: Arc::new(RwLock::new(Vec::new())),
            alert_sender,
            alert_receiver: Arc::new(RwLock::new(Some(alert_receiver))),
            shutdown_sender: Some(shutdown_sender),
        }
    }

    pub async fn start(&self) -> Result<(), Box<dyn std::error::Error>> {
        let mut is_running = self.is_running.write().await;
        if *is_running {
            return Ok(());
        }
        *is_running = true;
        drop(is_running);

        tracing::info!(
            "Starting performance monitor with interval: {:?}",
            self.config.interval
        );

        self.start_alert_processor().await?;

        self.start_monitoring_loop().await?;

        Ok(())
    }

    pub async fn stop(&self) -> Result<(), Box<dyn std::error::Error>> {
        let mut is_running = self.is_running.write().await;
        if !*is_running {
            return Ok(());
        }
        *is_running = false;

        if let Some(sender) = &self.shutdown_sender {
            let _ = sender.send(true);
        }

        tracing::info!("Stopped performance monitor");
        Ok(())
    }

    pub async fn update_metrics(&self, metrics: PerformanceMetrics) {
        let mut current = self.current_metrics.write().await;
        *current = Some(metrics.clone());
        drop(current);

        let mut history = self.metrics_history.write().await;
        history.push(metrics.clone());

        // Trim history to the retention window; `max(1)` guards against a
        // division by zero if the interval is configured as zero seconds.
        let max_history = (self.config.retention_duration.as_secs()
            / self.config.interval.as_secs().max(1)) as usize;
        if history.len() > max_history {
            history.remove(0);
        }
        drop(history);

        self.check_alerts(&metrics).await;
    }

    async fn check_alerts(&self, metrics: &PerformanceMetrics) {
        for target in &self.config.targets {
            if !target.enabled {
                continue;
            }

            let thresholds = target
                .custom_thresholds
                .as_ref()
                .unwrap_or(&self.config.thresholds);

            if let Some(alert) = self.check_target_alerts(target, metrics, thresholds).await {
                let _ = self.alert_sender.send(alert);
            }
        }
    }

    async fn check_target_alerts(
        &self,
        target: &MonitorTarget,
        metrics: &PerformanceMetrics,
        thresholds: &AlertThresholds,
    ) -> Option<PerformanceAlert> {
        match target.target_type {
            MonitorTargetType::SystemCpu => {
                if let Some(threshold) = thresholds.cpu_usage_percent {
                    if metrics.system.cpu_usage > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High CPU usage detected",
                                metrics.system.cpu_usage,
                                threshold,
                                "cpu_usage_percent",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::SystemMemory => {
                let memory_usage_percent = (metrics.system.memory_used as f64
                    / (metrics.system.memory_used + metrics.system.memory_available) as f64)
                    * 100.0;

                if let Some(threshold) = thresholds.memory_usage_percent {
                    if memory_usage_percent > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High memory usage detected",
                                memory_usage_percent,
                                threshold,
                                "memory_usage_percent",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::Gpu => {
                if let Some(ref gpu_metrics) = metrics.gpu {
                    if let Some(threshold) = thresholds.gpu_utilization_percent {
                        if gpu_metrics.utilization > threshold {
                            return Some(
                                self.create_alert(
                                    target,
                                    "High GPU utilization detected",
                                    gpu_metrics.utilization,
                                    threshold,
                                    "gpu_utilization_percent",
                                )
                                .await,
                            );
                        }
                    }

                    let gpu_memory_percent =
                        (gpu_metrics.memory_used as f64 / gpu_metrics.memory_total as f64) * 100.0;
                    if let Some(threshold) = thresholds.gpu_memory_percent {
                        if gpu_memory_percent > threshold {
                            return Some(
                                self.create_alert(
                                    target,
                                    "High GPU memory usage detected",
                                    gpu_memory_percent,
                                    threshold,
                                    "gpu_memory_percent",
                                )
                                .await,
                            );
                        }
                    }

                    if let Some(threshold) = thresholds.max_gpu_temperature {
                        if gpu_metrics.temperature > threshold {
                            return Some(
                                self.create_alert(
                                    target,
                                    "High GPU temperature detected",
                                    gpu_metrics.temperature,
                                    threshold,
                                    "gpu_temperature",
                                )
                                .await,
                            );
                        }
                    }
                }
            }
            MonitorTargetType::SynthesisPerformance => {
                if let Some(threshold) = thresholds.min_real_time_factor {
                    if metrics.synthesis.real_time_factor < threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "Poor synthesis performance detected",
                                metrics.synthesis.real_time_factor,
                                threshold,
                                "real_time_factor",
                            )
                            .await,
                        );
                    }
                }

                if let Some(threshold) = thresholds.max_synthesis_time_ms {
                    if metrics.synthesis.avg_synthesis_time_ms > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High synthesis time detected",
                                metrics.synthesis.avg_synthesis_time_ms,
                                threshold,
                                "synthesis_time_ms",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::QueueDepth => {
                if let Some(threshold) = thresholds.max_queue_depth {
                    if metrics.synthesis.queue_depth > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High queue depth detected",
                                metrics.synthesis.queue_depth as f64,
                                threshold as f64,
                                "queue_depth",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::ErrorRate => {
                let error_rate = if metrics.synthesis.total_operations > 0 {
                    (metrics.synthesis.failed_operations as f64
                        / metrics.synthesis.total_operations as f64)
                        * 100.0
                } else {
                    0.0
                };

                if let Some(threshold) = thresholds.max_error_rate_percent {
                    if error_rate > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High error rate detected",
                                error_rate,
                                threshold,
                                "error_rate_percent",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::DiskIo => {
                // `max_network_bps` doubles as a generic byte-rate threshold here,
                // since `AlertThresholds` has no dedicated disk-rate field.
                let total_disk_bps = metrics.system.disk_read_bps + metrics.system.disk_write_bps;
                if let Some(threshold) = thresholds.max_network_bps {
                    if total_disk_bps > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High disk I/O detected",
                                total_disk_bps as f64,
                                threshold as f64,
                                "disk_io_bps",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::NetworkIo => {
                if let Some(threshold) = thresholds.max_network_bps {
                    if metrics.system.network_bps > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High network I/O detected",
                                metrics.system.network_bps as f64,
                                threshold as f64,
                                "network_io_bps",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::Custom(_) => {
                // Custom targets are not evaluated by the built-in checks.
            }
        }

        None
    }

    async fn create_alert(
        &self,
        target: &MonitorTarget,
        message: &str,
        current_value: f64,
        threshold_value: f64,
        metric_name: &str,
    ) -> PerformanceAlert {
        let timestamp = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();

        let alert_id = format!(
            "{}_{}_{}_{}",
            target.name,
            metric_name,
            timestamp,
            fastrand::u32(..)
        );

        let mut context = HashMap::new();
        context.insert("target_name".to_string(), target.name.clone());
        context.insert("metric_name".to_string(), metric_name.to_string());
        context.insert("current_value".to_string(), current_value.to_string());
        context.insert("threshold_value".to_string(), threshold_value.to_string());

        PerformanceAlert {
            id: alert_id,
            timestamp,
            severity: target.severity.clone(),
            target: target.target_type.clone(),
            message: message.to_string(),
            current_value,
            threshold_value,
            metric_name: metric_name.to_string(),
            context,
            resolved: false,
            resolved_at: None,
        }
    }

    async fn start_alert_processor(&self) -> Result<(), Box<dyn std::error::Error>> {
        let mut receiver = self.alert_receiver.write().await;
        if let Some(rx) = receiver.take() {
            let alert_manager = self.alert_manager.clone();
            let auto_recovery = self.config.auto_recovery.clone();

            tokio::spawn(async move {
                let mut rx = rx;
                while let Some(alert) = rx.recv().await {
                    alert_manager.process_alert(alert.clone()).await;

                    if auto_recovery.enabled {
                        Self::attempt_auto_recovery(&alert, &auto_recovery).await;
                    }
                }
            });
        }

        Ok(())
    }

    async fn start_monitoring_loop(&self) -> Result<(), Box<dyn std::error::Error>> {
        let is_running = self.is_running.clone();
        let interval = self.config.interval;

        tokio::spawn(async move {
            let mut interval_timer = tokio::time::interval(interval);

            loop {
                interval_timer.tick().await;

                // The loop only emits a heartbeat; metric collection is driven
                // externally through `update_metrics`. After `stop()` clears
                // the running flag, the task exits on the next tick.
                let running = is_running.read().await;
                if !*running {
                    break;
                }
                drop(running);

                tracing::debug!("Performance monitoring tick");
            }
        });

        Ok(())
    }

    async fn attempt_auto_recovery(_alert: &PerformanceAlert, config: &AutoRecoveryConfig) {
        // The triggering alert is currently unused; actions are taken purely
        // from the configured list. Apart from `PauseProcessing`, each action
        // only logs the intended recovery step; wiring the actions to the
        // actual subsystems is left to the integration layer.
        for action in &config.actions {
            match action {
                RecoveryAction::ReduceBatchSize { min_size } => {
                    tracing::info!("Auto-recovery: Reducing batch size (min: {})", min_size);
                }
                RecoveryAction::ClearCaches => {
                    tracing::info!("Auto-recovery: Clearing caches");
                }
                RecoveryAction::RestartWorkers => {
                    tracing::info!("Auto-recovery: Restarting worker threads");
                }
                RecoveryAction::ReduceParallelism { min_threads } => {
                    tracing::info!("Auto-recovery: Reducing parallelism (min: {})", min_threads);
                }
                RecoveryAction::EnableMemoryOptimization => {
                    tracing::info!("Auto-recovery: Enabling memory optimization");
                }
                RecoveryAction::ReduceQuality => {
                    tracing::info!("Auto-recovery: Reducing synthesis quality");
                }
                RecoveryAction::PauseProcessing { duration } => {
                    tracing::info!("Auto-recovery: Pausing processing for {:?}", duration);
                    tokio::time::sleep(*duration).await;
                }
                RecoveryAction::CustomCommand { command, args } => {
                    tracing::info!(
                        "Auto-recovery: Executing custom command: {} {:?}",
                        command,
                        args
                    );
                }
            }

            tokio::time::sleep(config.retry_delay).await;
        }
    }

    pub async fn is_running(&self) -> bool {
        *self.is_running.read().await
    }

    pub async fn get_current_metrics(&self) -> Option<PerformanceMetrics> {
        self.current_metrics.read().await.clone()
    }

    pub async fn get_active_alerts(&self) -> Vec<PerformanceAlert> {
        self.alert_manager.get_active_alerts().await
    }

    pub async fn get_alert_history(&self, limit: Option<usize>) -> Vec<PerformanceAlert> {
        self.alert_manager.get_alert_history(limit).await
    }

    pub async fn update_config(&mut self, config: MonitorConfig) {
        self.config = config;
    }
}

impl AlertManager {
    pub fn new(config: AlertConfig) -> Self {
        Self {
            config,
            active_alerts: Arc::new(RwLock::new(HashMap::new())),
            alert_history: Arc::new(RwLock::new(Vec::new())),
            alert_cooldowns: Arc::new(RwLock::new(HashMap::new())),
            hourly_alert_count: Arc::new(RwLock::new(0)),
            last_hour_reset: Arc::new(RwLock::new(Instant::now())),
        }
    }

    async fn process_alert(&self, alert: PerformanceAlert) {
        if self.is_in_cooldown(&alert).await {
            return;
        }

        if !self.can_send_alert().await {
            tracing::warn!(
                "Alert rate limit exceeded, skipping alert: {}",
                alert.message
            );
            return;
        }

        let mut active = self.active_alerts.write().await;
        active.insert(alert.id.clone(), alert.clone());
        drop(active);

        let mut history = self.alert_history.write().await;
        history.push(alert.clone());
        drop(history);

        self.set_cooldown(&alert).await;

        self.send_notifications(&alert).await;

        self.increment_hourly_count().await;
    }

    async fn is_in_cooldown(&self, alert: &PerformanceAlert) -> bool {
        let cooldowns = self.alert_cooldowns.read().await;
        if let Some(&last_sent) = cooldowns.get(&alert.metric_name) {
            last_sent.elapsed() < self.config.cooldown_duration
        } else {
            false
        }
    }

    async fn set_cooldown(&self, alert: &PerformanceAlert) {
        let mut cooldowns = self.alert_cooldowns.write().await;
        cooldowns.insert(alert.metric_name.clone(), Instant::now());
    }

    async fn can_send_alert(&self) -> bool {
        let mut last_reset = self.last_hour_reset.write().await;
        if last_reset.elapsed() >= Duration::from_secs(3600) {
            *last_reset = Instant::now();
            let mut count = self.hourly_alert_count.write().await;
            *count = 0;
        }
        drop(last_reset);

        let count = self.hourly_alert_count.read().await;
        *count < self.config.max_alerts_per_hour
    }

    async fn increment_hourly_count(&self) {
        let mut count = self.hourly_alert_count.write().await;
        *count += 1;
    }

    async fn send_notifications(&self, alert: &PerformanceAlert) {
        if self.config.console_logging {
            match alert.severity {
                AlertSeverity::Info => {
                    tracing::info!("ALERT [{}]: {}", alert.severity_string(), alert.message)
                }
                AlertSeverity::Warning => {
                    tracing::warn!("ALERT [{}]: {}", alert.severity_string(), alert.message)
                }
                AlertSeverity::Critical | AlertSeverity::Emergency => {
                    tracing::error!("ALERT [{}]: {}", alert.severity_string(), alert.message)
                }
            }
        }

        if let Some(ref log_path) = self.config.file_logging {
            let log_entry = format!(
                "{} [{}] {}: {} (current: {:.2}, threshold: {:.2})\n",
                alert.timestamp,
                alert.severity_string(),
                alert.metric_name,
                alert.message,
                alert.current_value,
                alert.threshold_value
            );

            // Append rather than overwrite so earlier alerts are preserved
            // (relies on tokio's `AsyncWriteExt`, i.e. the `io-util` feature).
            use tokio::io::AsyncWriteExt;
            let open_result = tokio::fs::OpenOptions::new()
                .create(true)
                .append(true)
                .open(log_path)
                .await;
            match open_result {
                Ok(mut file) => {
                    if let Err(e) = file.write_all(log_entry.as_bytes()).await {
                        tracing::error!("Failed to write alert to log file: {}", e);
                    }
                }
                Err(e) => tracing::error!("Failed to open alert log file: {}", e),
            }
        }

        if let Some(ref email_config) = self.config.email_notifications {
            if let Err(e) = self.send_email_alert(alert, email_config).await {
                tracing::error!("Failed to send email alert: {}", e);
            }
        }

        if let Some(ref webhook_config) = self.config.webhook_notifications {
            if let Err(e) = self.send_webhook_alert(alert, webhook_config).await {
                tracing::error!("Failed to send webhook alert: {}", e);
            }
        }
    }

    async fn send_email_alert(
        &self,
        alert: &PerformanceAlert,
        config: &EmailConfig,
    ) -> Result<(), Box<dyn std::error::Error>> {
        // Placeholder: email delivery is not implemented yet; only log the intent.
        tracing::info!(
            "Would send email alert to {:?}: {}",
            config.to_emails,
            alert.message
        );
        Ok(())
    }

    async fn send_webhook_alert(
        &self,
        alert: &PerformanceAlert,
        config: &WebhookConfig,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let client = reqwest::Client::new();
        let payload = serde_json::to_value(alert)?;

        for attempt in 0..=config.retry_attempts {
            let mut request = match config.method.to_uppercase().as_str() {
                "POST" => client.post(&config.url),
                "PUT" => client.put(&config.url),
                "PATCH" => client.patch(&config.url),
                _ => client.get(&config.url),
            };

            for (key, value) in &config.headers {
                request = request.header(key, value);
            }

            if matches!(
                config.method.to_uppercase().as_str(),
                "POST" | "PUT" | "PATCH"
            ) {
                request = request.json(&payload);
            }

            let response = request
                .timeout(Duration::from_secs(config.timeout_seconds))
                .send()
                .await;

            match response {
                Ok(resp) if resp.status().is_success() => {
                    tracing::info!("Webhook alert sent successfully to {}", config.url);
                    return Ok(());
                }
                Ok(resp) => {
                    tracing::warn!(
                        "Webhook alert failed with status {}: {}",
                        resp.status(),
                        config.url
                    );
                }
                Err(e) => {
                    tracing::warn!("Webhook alert attempt {} failed: {}", attempt + 1, e);
                }
            }

            if attempt < config.retry_attempts {
                // Exponential backoff: 1 s, 2 s, 4 s, ... between attempts.
                tokio::time::sleep(Duration::from_secs(2_u64.pow(attempt as u32))).await;
            }
        }

        Err("All webhook attempts failed".into())
    }

    async fn get_active_alerts(&self) -> Vec<PerformanceAlert> {
        let active = self.active_alerts.read().await;
        active.values().cloned().collect()
    }

    async fn get_alert_history(&self, limit: Option<usize>) -> Vec<PerformanceAlert> {
        let history = self.alert_history.read().await;
        if let Some(limit) = limit {
            history.iter().rev().take(limit).cloned().collect()
        } else {
            history.clone()
        }
    }

    pub async fn resolve_alert(&self, alert_id: &str) -> bool {
        let mut active = self.active_alerts.write().await;
        if let Some(mut alert) = active.remove(alert_id) {
            alert.resolved = true;
            alert.resolved_at = Some(
                SystemTime::now()
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or_default()
                    .as_secs(),
            );

            let mut history = self.alert_history.write().await;
            if let Some(hist_alert) = history.iter_mut().find(|a| a.id == alert_id) {
                hist_alert.resolved = true;
                hist_alert.resolved_at = alert.resolved_at;
            }

            true
        } else {
            false
        }
    }
}

impl Clone for AlertManager {
    fn clone(&self) -> Self {
        Self {
            config: self.config.clone(),
            active_alerts: self.active_alerts.clone(),
            alert_history: self.alert_history.clone(),
            alert_cooldowns: self.alert_cooldowns.clone(),
            hourly_alert_count: self.hourly_alert_count.clone(),
            last_hour_reset: self.last_hour_reset.clone(),
        }
    }
}

impl PerformanceAlert {
    pub fn severity_string(&self) -> &'static str {
        match self.severity {
            AlertSeverity::Info => "INFO",
            AlertSeverity::Warning => "WARNING",
            AlertSeverity::Critical => "CRITICAL",
            AlertSeverity::Emergency => "EMERGENCY",
        }
    }
}

impl Default for MonitorConfig {
    fn default() -> Self {
        Self {
            interval: Duration::from_secs(10),
            enabled: false,
            thresholds: AlertThresholds::default(),
            alerts: AlertConfig::default(),
            targets: vec![
                MonitorTarget {
                    name: "system_cpu".to_string(),
                    target_type: MonitorTargetType::SystemCpu,
                    enabled: true,
                    custom_thresholds: None,
                    severity: AlertSeverity::Warning,
                },
                MonitorTarget {
                    name: "system_memory".to_string(),
                    target_type: MonitorTargetType::SystemMemory,
                    enabled: true,
                    custom_thresholds: None,
                    severity: AlertSeverity::Warning,
                },
                MonitorTarget {
                    name: "synthesis_performance".to_string(),
                    target_type: MonitorTargetType::SynthesisPerformance,
                    enabled: true,
                    custom_thresholds: None,
                    severity: AlertSeverity::Critical,
                },
            ],
            retention_duration: Duration::from_secs(3600),
            auto_recovery: AutoRecoveryConfig::default(),
        }
    }
}

impl Default for AlertThresholds {
    fn default() -> Self {
        Self {
            cpu_usage_percent: Some(80.0),
            memory_usage_percent: Some(85.0),
            gpu_utilization_percent: Some(95.0),
            gpu_memory_percent: Some(90.0),
            min_real_time_factor: Some(0.8),
            max_synthesis_time_ms: Some(5000.0),
            max_queue_depth: Some(20),
            min_success_rate_percent: Some(90.0),
            max_error_rate_percent: Some(10.0),
            max_gpu_temperature: Some(85.0),
            max_disk_usage_percent: Some(90.0),
            max_network_bps: Some(1_000_000_000),
        }
    }
}

impl Default for AlertConfig {
    fn default() -> Self {
        Self {
            console_logging: true,
            file_logging: None,
            email_notifications: None,
            webhook_notifications: None,
            cooldown_duration: Duration::from_secs(300),
            max_alerts_per_hour: 20,
        }
    }
}

impl Default for AutoRecoveryConfig {
    fn default() -> Self {
        Self {
            enabled: false,
            actions: vec![
                RecoveryAction::ReduceBatchSize { min_size: 8 },
                RecoveryAction::ClearCaches,
                RecoveryAction::ReduceParallelism { min_threads: 2 },
            ],
            max_attempts_per_hour: 5,
            retry_delay: Duration::from_secs(30),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_performance_monitor_creation() {
        let config = MonitorConfig::default();
        let monitor = PerformanceMonitor::new(config);
        assert!(!monitor.is_running().await);
    }

    #[tokio::test]
    async fn test_alert_creation() {
        let config = MonitorConfig::default();
        let monitor = PerformanceMonitor::new(config);

        let target = MonitorTarget {
            name: "test_cpu".to_string(),
            target_type: MonitorTargetType::SystemCpu,
            enabled: true,
            custom_thresholds: None,
            severity: AlertSeverity::Warning,
        };

        let alert = monitor
            .create_alert(
                &target,
                "Test alert message",
                85.0,
                80.0,
                "cpu_usage_percent",
            )
            .await;

        assert_eq!(alert.severity, AlertSeverity::Warning);
        assert_eq!(alert.current_value, 85.0);
        assert_eq!(alert.threshold_value, 80.0);
        assert!(!alert.resolved);
    }

    #[tokio::test]
    async fn test_alert_manager() {
        let config = AlertConfig::default();
        let manager = AlertManager::new(config);

        let alert = PerformanceAlert {
            id: "test_alert".to_string(),
            timestamp: 1234567890,
            severity: AlertSeverity::Warning,
            target: MonitorTargetType::SystemCpu,
            message: "Test alert".to_string(),
            current_value: 85.0,
            threshold_value: 80.0,
            metric_name: "cpu_usage".to_string(),
            context: HashMap::new(),
            resolved: false,
            resolved_at: None,
        };

        manager.process_alert(alert.clone()).await;

        let active = manager.get_active_alerts().await;
        assert_eq!(active.len(), 1);
        assert_eq!(active[0].id, "test_alert");

        let resolved = manager.resolve_alert("test_alert").await;
        assert!(resolved);

        let active = manager.get_active_alerts().await;
        assert_eq!(active.len(), 0);
    }

    #[test]
    fn test_alert_thresholds_default() {
        let thresholds = AlertThresholds::default();
        assert_eq!(thresholds.cpu_usage_percent, Some(80.0));
        assert_eq!(thresholds.memory_usage_percent, Some(85.0));
        assert_eq!(thresholds.min_real_time_factor, Some(0.8));
    }

    #[test]
    fn test_recovery_actions() {
        let action = RecoveryAction::ReduceBatchSize { min_size: 8 };
        match action {
            RecoveryAction::ReduceBatchSize { min_size } => {
                assert_eq!(min_size, 8);
            }
            _ => panic!("Wrong action type"),
        }
    }

    #[tokio::test]
    async fn test_monitor_start_stop() {
        let config = MonitorConfig::default();
        let monitor = PerformanceMonitor::new(config);

        assert!(!monitor.is_running().await);

        monitor.start().await.unwrap();
        assert!(monitor.is_running().await);

        monitor.stop().await.unwrap();
        assert!(!monitor.is_running().await);
    }
}