voirs_cli/performance/monitor.rs

//! Real-time performance monitoring and alerting
//!
//! This module provides continuous monitoring of system performance with
//! configurable thresholds, alerts, and automated responses.
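//!
//! A minimal usage sketch (marked `ignore`: it assumes an async, error-propagating
//! context and `PerformanceMetrics` values supplied by the caller):
//!
//! ```ignore
//! let monitor = PerformanceMonitor::new(MonitorConfig::default());
//! monitor.start().await?;
//! // Feed measurements collected elsewhere:
//! monitor.update_metrics(metrics).await;
//! let active = monitor.get_active_alerts().await;
//! monitor.stop().await?;
//! ```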

use super::{GpuMetrics, MemoryMetrics, PerformanceMetrics, SynthesisMetrics, SystemMetrics};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use tokio::sync::{mpsc, watch, RwLock};

/// Performance monitoring configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonitorConfig {
    /// Monitoring interval
    pub interval: Duration,
    /// Enable monitoring
    pub enabled: bool,
    /// Alert thresholds
    pub thresholds: AlertThresholds,
    /// Alert channels configuration
    pub alerts: AlertConfig,
    /// Monitoring targets
    pub targets: Vec<MonitorTarget>,
    /// Historical data retention
    pub retention_duration: Duration,
    /// Auto-recovery settings
    pub auto_recovery: AutoRecoveryConfig,
}

/// Alert threshold configurations
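///
/// A construction sketch (illustrative values): override one threshold and keep
/// the defaults from `AlertThresholds::default()` for the rest.
///
/// ```ignore
/// let thresholds = AlertThresholds {
///     cpu_usage_percent: Some(90.0),
///     ..AlertThresholds::default()
/// };
/// ```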
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertThresholds {
    /// Maximum CPU usage percentage (0-100)
    pub cpu_usage_percent: Option<f64>,
    /// Maximum memory usage percentage (0-100)
    pub memory_usage_percent: Option<f64>,
    /// Maximum GPU utilization percentage (0-100)
    pub gpu_utilization_percent: Option<f64>,
    /// Maximum GPU memory usage percentage (0-100)
    pub gpu_memory_percent: Option<f64>,
    /// Minimum acceptable real-time factor
    pub min_real_time_factor: Option<f64>,
    /// Maximum synthesis time in milliseconds
    pub max_synthesis_time_ms: Option<f64>,
    /// Maximum queue depth
    pub max_queue_depth: Option<usize>,
    /// Minimum success rate percentage
    pub min_success_rate_percent: Option<f64>,
    /// Maximum error rate percentage
    pub max_error_rate_percent: Option<f64>,
    /// Maximum GPU temperature in Celsius
    pub max_gpu_temperature: Option<f64>,
    /// Maximum disk usage percentage (0-100)
    pub max_disk_usage_percent: Option<f64>,
    /// Maximum network bandwidth in bytes/sec
    pub max_network_bps: Option<u64>,
}

/// Alert configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertConfig {
    /// Enable console logging of alerts
    pub console_logging: bool,
    /// Optional path for file logging of alerts (`None` disables file logging)
    pub file_logging: Option<std::path::PathBuf>,
    /// Email notification settings (`None` disables email alerts)
    pub email_notifications: Option<EmailConfig>,
    /// Webhook notification settings (`None` disables webhook alerts)
    pub webhook_notifications: Option<WebhookConfig>,
    /// Alert cooldown period to prevent spam
    pub cooldown_duration: Duration,
    /// Maximum alerts per hour
    pub max_alerts_per_hour: usize,
}

/// Email notification configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmailConfig {
    /// SMTP server address
    pub smtp_server: String,
    /// SMTP port
    pub smtp_port: u16,
    /// Username for authentication
    pub username: String,
    /// Password for authentication (should be encrypted/secured)
    pub password: String,
    /// Sender email address
    pub from_email: String,
    /// Recipient email addresses
    pub to_emails: Vec<String>,
    /// Use TLS encryption
    pub use_tls: bool,
}

/// Webhook notification configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebhookConfig {
    /// Webhook URL
    pub url: String,
    /// HTTP method (GET, POST, etc.)
    pub method: String,
    /// Custom headers
    pub headers: HashMap<String, String>,
    /// Request timeout in seconds
    pub timeout_seconds: u64,
    /// Retry attempts
    pub retry_attempts: usize,
}

/// Monitoring target configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonitorTarget {
    /// Target name/identifier
    pub name: String,
    /// Target type
    pub target_type: MonitorTargetType,
    /// Enable monitoring for this target
    pub enabled: bool,
    /// Custom thresholds for this target
    pub custom_thresholds: Option<AlertThresholds>,
    /// Alert severity level
    pub severity: AlertSeverity,
}

/// Types of monitoring targets
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MonitorTargetType {
    /// System-wide CPU monitoring
    SystemCpu,
    /// System-wide memory monitoring
    SystemMemory,
    /// GPU monitoring
    Gpu,
    /// Synthesis performance monitoring
    SynthesisPerformance,
    /// Queue depth monitoring
    QueueDepth,
    /// Error rate monitoring
    ErrorRate,
    /// Disk I/O monitoring
    DiskIo,
    /// Network I/O monitoring
    NetworkIo,
    /// Custom metric monitoring
    Custom(String),
}

/// Alert severity levels
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AlertSeverity {
    /// Informational alerts
    Info,
    /// Warning alerts
    Warning,
    /// Critical alerts (requires immediate attention)
    Critical,
    /// Emergency alerts (system may be unusable)
    Emergency,
}

/// Auto-recovery configuration
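///
/// A configuration sketch (illustrative values) enabling two of the recovery
/// actions defined below:
///
/// ```ignore
/// let recovery = AutoRecoveryConfig {
///     enabled: true,
///     actions: vec![
///         RecoveryAction::ClearCaches,
///         RecoveryAction::PauseProcessing { duration: Duration::from_secs(5) },
///     ],
///     max_attempts_per_hour: 3,
///     retry_delay: Duration::from_secs(30),
/// };
/// ```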
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoRecoveryConfig {
    /// Enable automatic recovery actions
    pub enabled: bool,
    /// Recovery actions to attempt
    pub actions: Vec<RecoveryAction>,
    /// Maximum recovery attempts per hour
    pub max_attempts_per_hour: usize,
    /// Delay between recovery attempts
    pub retry_delay: Duration,
}

/// Recovery action types
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RecoveryAction {
    /// Reduce batch size
    ReduceBatchSize { min_size: usize },
    /// Clear caches
    ClearCaches,
    /// Restart worker threads
    RestartWorkers,
    /// Reduce parallel processing
    ReduceParallelism { min_threads: usize },
    /// Enable memory optimization
    EnableMemoryOptimization,
    /// Switch to lower quality mode
    ReduceQuality,
    /// Pause processing temporarily
    PauseProcessing { duration: Duration },
    /// Custom command execution
    CustomCommand { command: String, args: Vec<String> },
}

/// Performance alert
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceAlert {
    /// Alert ID
    pub id: String,
    /// Alert timestamp
    pub timestamp: u64,
    /// Alert severity
    pub severity: AlertSeverity,
    /// Target that triggered the alert
    pub target: MonitorTargetType,
    /// Alert message
    pub message: String,
    /// Current metric value
    pub current_value: f64,
    /// Threshold that was exceeded
    pub threshold_value: f64,
    /// Metric name
    pub metric_name: String,
    /// Additional context
    pub context: HashMap<String, String>,
    /// Whether the alert is resolved
    pub resolved: bool,
    /// Resolution timestamp
    pub resolved_at: Option<u64>,
}

/// Alert manager for handling notifications and recovery
#[derive(Debug)]
pub struct AlertManager {
    /// Configuration
    config: AlertConfig,
    /// Active alerts
    active_alerts: Arc<RwLock<HashMap<String, PerformanceAlert>>>,
    /// Alert history
    alert_history: Arc<RwLock<Vec<PerformanceAlert>>>,
    /// Alert cooldown tracking
    alert_cooldowns: Arc<RwLock<HashMap<String, Instant>>>,
    /// Alerts sent in current hour
    hourly_alert_count: Arc<RwLock<usize>>,
    /// Last hour reset time
    last_hour_reset: Arc<RwLock<Instant>>,
}

/// Performance monitor
pub struct PerformanceMonitor {
    /// Monitor configuration
    config: MonitorConfig,
    /// Alert manager
    alert_manager: AlertManager,
    /// Current metrics
    current_metrics: Arc<RwLock<Option<PerformanceMetrics>>>,
    /// Monitoring status
    is_running: Arc<RwLock<bool>>,
    /// Metrics history for trend analysis
    metrics_history: Arc<RwLock<Vec<PerformanceMetrics>>>,
    /// Alert sender channel
    alert_sender: mpsc::UnboundedSender<PerformanceAlert>,
    /// Alert receiver channel
    alert_receiver: Arc<RwLock<Option<mpsc::UnboundedReceiver<PerformanceAlert>>>>,
    /// Shutdown signal
    shutdown_sender: Option<watch::Sender<bool>>,
}

impl PerformanceMonitor {
    /// Create a new performance monitor
    pub fn new(config: MonitorConfig) -> Self {
        let (alert_sender, alert_receiver) = mpsc::unbounded_channel();
        let (shutdown_sender, _) = watch::channel(false);

        Self {
            alert_manager: AlertManager::new(config.alerts.clone()),
            config,
            current_metrics: Arc::new(RwLock::new(None)),
            is_running: Arc::new(RwLock::new(false)),
            metrics_history: Arc::new(RwLock::new(Vec::new())),
            alert_sender,
            alert_receiver: Arc::new(RwLock::new(Some(alert_receiver))),
            shutdown_sender: Some(shutdown_sender),
        }
    }

    /// Start monitoring
    pub async fn start(&self) -> Result<(), Box<dyn std::error::Error>> {
        let mut is_running = self.is_running.write().await;
        if *is_running {
            return Ok(());
        }
        *is_running = true;
        drop(is_running);

        tracing::info!(
            "Starting performance monitor with interval: {:?}",
            self.config.interval
        );

        // Start alert processing task
        self.start_alert_processor().await?;

        // Start monitoring loop
        self.start_monitoring_loop().await?;

        Ok(())
    }

    /// Stop monitoring
    pub async fn stop(&self) -> Result<(), Box<dyn std::error::Error>> {
        let mut is_running = self.is_running.write().await;
        if !*is_running {
            return Ok(());
        }
        *is_running = false;

        // Send shutdown signal
        if let Some(sender) = &self.shutdown_sender {
            let _ = sender.send(true);
        }

        tracing::info!("Stopped performance monitor");
        Ok(())
    }

    /// Update current metrics
    pub async fn update_metrics(&self, metrics: PerformanceMetrics) {
        // Store current metrics
        let mut current = self.current_metrics.write().await;
        *current = Some(metrics.clone());
        drop(current);

        // Add to history
        let mut history = self.metrics_history.write().await;
        history.push(metrics.clone());

        // Maintain history size; guard against sub-second intervals so the
        // division below cannot divide by zero
        let max_history = (self.config.retention_duration.as_secs()
            / self.config.interval.as_secs().max(1)) as usize;
        if history.len() > max_history {
            history.remove(0);
        }
        drop(history);

        // Check for alerts
        self.check_alerts(&metrics).await;
    }

    /// Check metrics against alert thresholds
    async fn check_alerts(&self, metrics: &PerformanceMetrics) {
        for target in &self.config.targets {
            if !target.enabled {
                continue;
            }

            let thresholds = target
                .custom_thresholds
                .as_ref()
                .unwrap_or(&self.config.thresholds);

            if let Some(alert) = self.check_target_alerts(target, metrics, thresholds).await {
                let _ = self.alert_sender.send(alert);
            }
        }
    }

    /// Check alerts for a specific target
    async fn check_target_alerts(
        &self,
        target: &MonitorTarget,
        metrics: &PerformanceMetrics,
        thresholds: &AlertThresholds,
    ) -> Option<PerformanceAlert> {
        match target.target_type {
            MonitorTargetType::SystemCpu => {
                if let Some(threshold) = thresholds.cpu_usage_percent {
                    if metrics.system.cpu_usage > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High CPU usage detected",
                                metrics.system.cpu_usage,
                                threshold,
                                "cpu_usage_percent",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::SystemMemory => {
                let memory_usage_percent = (metrics.system.memory_used as f64
                    / (metrics.system.memory_used + metrics.system.memory_available) as f64)
                    * 100.0;

                if let Some(threshold) = thresholds.memory_usage_percent {
                    if memory_usage_percent > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High memory usage detected",
                                memory_usage_percent,
                                threshold,
                                "memory_usage_percent",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::Gpu => {
                if let Some(ref gpu_metrics) = metrics.gpu {
                    // Check GPU utilization
                    if let Some(threshold) = thresholds.gpu_utilization_percent {
                        if gpu_metrics.utilization > threshold {
                            return Some(
                                self.create_alert(
                                    target,
                                    "High GPU utilization detected",
                                    gpu_metrics.utilization,
                                    threshold,
                                    "gpu_utilization_percent",
                                )
                                .await,
                            );
                        }
                    }

                    // Check GPU memory
                    let gpu_memory_percent =
                        (gpu_metrics.memory_used as f64 / gpu_metrics.memory_total as f64) * 100.0;
                    if let Some(threshold) = thresholds.gpu_memory_percent {
                        if gpu_memory_percent > threshold {
                            return Some(
                                self.create_alert(
                                    target,
                                    "High GPU memory usage detected",
                                    gpu_memory_percent,
                                    threshold,
                                    "gpu_memory_percent",
                                )
                                .await,
                            );
                        }
                    }

                    // Check GPU temperature
                    if let Some(threshold) = thresholds.max_gpu_temperature {
                        if gpu_metrics.temperature > threshold {
                            return Some(
                                self.create_alert(
                                    target,
                                    "High GPU temperature detected",
                                    gpu_metrics.temperature,
                                    threshold,
                                    "gpu_temperature",
                                )
                                .await,
                            );
                        }
                    }
                }
            }
            MonitorTargetType::SynthesisPerformance => {
                // Check real-time factor
                if let Some(threshold) = thresholds.min_real_time_factor {
                    if metrics.synthesis.real_time_factor < threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "Poor synthesis performance detected",
                                metrics.synthesis.real_time_factor,
                                threshold,
                                "real_time_factor",
                            )
                            .await,
                        );
                    }
                }

                // Check synthesis time
                if let Some(threshold) = thresholds.max_synthesis_time_ms {
                    if metrics.synthesis.avg_synthesis_time_ms > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High synthesis time detected",
                                metrics.synthesis.avg_synthesis_time_ms,
                                threshold,
                                "synthesis_time_ms",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::QueueDepth => {
                if let Some(threshold) = thresholds.max_queue_depth {
                    if metrics.synthesis.queue_depth > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High queue depth detected",
                                metrics.synthesis.queue_depth as f64,
                                threshold as f64,
                                "queue_depth",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::ErrorRate => {
                let error_rate = if metrics.synthesis.total_operations > 0 {
                    (metrics.synthesis.failed_operations as f64
                        / metrics.synthesis.total_operations as f64)
                        * 100.0
                } else {
                    0.0
                };

                if let Some(threshold) = thresholds.max_error_rate_percent {
                    if error_rate > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High error rate detected",
                                error_rate,
                                threshold,
                                "error_rate_percent",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::DiskIo => {
                let total_disk_bps = metrics.system.disk_read_bps + metrics.system.disk_write_bps;
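                // Note: AlertThresholds has no dedicated disk-bandwidth field,
                // so max_network_bps doubles as the disk I/O limit below.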
                if let Some(threshold) = thresholds.max_network_bps {
                    if total_disk_bps > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High disk I/O detected",
                                total_disk_bps as f64,
                                threshold as f64,
                                "disk_io_bps",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::NetworkIo => {
                if let Some(threshold) = thresholds.max_network_bps {
                    if metrics.system.network_bps > threshold {
                        return Some(
                            self.create_alert(
                                target,
                                "High network I/O detected",
                                metrics.system.network_bps as f64,
                                threshold as f64,
                                "network_io_bps",
                            )
                            .await,
                        );
                    }
                }
            }
            MonitorTargetType::Custom(_) => {
                // Custom monitoring logic would go here
            }
        }

        None
    }

    /// Create a performance alert
    async fn create_alert(
        &self,
        target: &MonitorTarget,
        message: &str,
        current_value: f64,
        threshold_value: f64,
        metric_name: &str,
    ) -> PerformanceAlert {
        let timestamp = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();

        let alert_id = format!(
            "{}_{}_{}_{}",
            target.name,
            metric_name,
            timestamp,
            fastrand::u32(..)
        );

        let mut context = HashMap::new();
        context.insert("target_name".to_string(), target.name.clone());
        context.insert("metric_name".to_string(), metric_name.to_string());
        context.insert("current_value".to_string(), current_value.to_string());
        context.insert("threshold_value".to_string(), threshold_value.to_string());

        PerformanceAlert {
            id: alert_id,
            timestamp,
            severity: target.severity.clone(),
            target: target.target_type.clone(),
            message: message.to_string(),
            current_value,
            threshold_value,
            metric_name: metric_name.to_string(),
            context,
            resolved: false,
            resolved_at: None,
        }
    }

    /// Start alert processing task
    async fn start_alert_processor(&self) -> Result<(), Box<dyn std::error::Error>> {
        let mut receiver = self.alert_receiver.write().await;
        if let Some(rx) = receiver.take() {
            let alert_manager = self.alert_manager.clone();
            let auto_recovery = self.config.auto_recovery.clone();

            tokio::spawn(async move {
                let mut rx = rx;
                while let Some(alert) = rx.recv().await {
                    // Process the alert
                    alert_manager.process_alert(alert.clone()).await;

                    // Attempt auto-recovery if enabled
                    if auto_recovery.enabled {
                        Self::attempt_auto_recovery(&alert, &auto_recovery).await;
                    }
                }
            });
        }

        Ok(())
    }

    /// Start monitoring loop
    async fn start_monitoring_loop(&self) -> Result<(), Box<dyn std::error::Error>> {
        let is_running = self.is_running.clone();
        let interval = self.config.interval;

        tokio::spawn(async move {
            let mut interval_timer = tokio::time::interval(interval);

            loop {
                interval_timer.tick().await;

                let running = is_running.read().await;
                if !*running {
                    break;
                }
                drop(running);

                // Monitoring loop would collect metrics here
                // For now, this is a placeholder since metrics come from external sources
                tracing::debug!("Performance monitoring tick");
            }
        });

        Ok(())
    }

    /// Attempt automatic recovery
    async fn attempt_auto_recovery(alert: &PerformanceAlert, config: &AutoRecoveryConfig) {
        for action in &config.actions {
            match action {
                RecoveryAction::ReduceBatchSize { min_size } => {
                    tracing::info!("Auto-recovery: Reducing batch size (min: {})", min_size);
                    // Implementation would reduce batch size in the synthesis system
                }
                RecoveryAction::ClearCaches => {
                    tracing::info!("Auto-recovery: Clearing caches");
                    // Implementation would clear system caches
                }
                RecoveryAction::RestartWorkers => {
                    tracing::info!("Auto-recovery: Restarting worker threads");
                    // Implementation would restart worker thread pool
                }
                RecoveryAction::ReduceParallelism { min_threads } => {
                    tracing::info!("Auto-recovery: Reducing parallelism (min: {})", min_threads);
                    // Implementation would reduce thread count
                }
                RecoveryAction::EnableMemoryOptimization => {
                    tracing::info!("Auto-recovery: Enabling memory optimization");
                    // Implementation would enable memory optimization features
                }
                RecoveryAction::ReduceQuality => {
                    tracing::info!("Auto-recovery: Reducing synthesis quality");
                    // Implementation would reduce quality settings
                }
                RecoveryAction::PauseProcessing { duration } => {
                    tracing::info!("Auto-recovery: Pausing processing for {:?}", duration);
                    tokio::time::sleep(*duration).await;
                }
                RecoveryAction::CustomCommand { command, args } => {
                    tracing::info!(
                        "Auto-recovery: Executing custom command: {} {:?}",
                        command,
                        args
                    );
                    // Implementation would execute custom recovery command
                }
            }

            // Wait between recovery actions
            tokio::time::sleep(config.retry_delay).await;
        }
    }

    /// Get current monitoring status
    pub async fn is_running(&self) -> bool {
        *self.is_running.read().await
    }

    /// Get current metrics
    pub async fn get_current_metrics(&self) -> Option<PerformanceMetrics> {
        self.current_metrics.read().await.clone()
    }

    /// Get active alerts
    pub async fn get_active_alerts(&self) -> Vec<PerformanceAlert> {
        self.alert_manager.get_active_alerts().await
    }

    /// Get alert history
    pub async fn get_alert_history(&self, limit: Option<usize>) -> Vec<PerformanceAlert> {
        self.alert_manager.get_alert_history(limit).await
    }

    /// Update monitor configuration
    pub async fn update_config(&mut self, config: MonitorConfig) {
        self.config = config;
    }
}

impl AlertManager {
    /// Create a new alert manager
    pub fn new(config: AlertConfig) -> Self {
        Self {
            config,
            active_alerts: Arc::new(RwLock::new(HashMap::new())),
            alert_history: Arc::new(RwLock::new(Vec::new())),
            alert_cooldowns: Arc::new(RwLock::new(HashMap::new())),
            hourly_alert_count: Arc::new(RwLock::new(0)),
            last_hour_reset: Arc::new(RwLock::new(Instant::now())),
        }
    }

    /// Process a performance alert
    async fn process_alert(&self, alert: PerformanceAlert) {
        // Check cooldown
        if self.is_in_cooldown(&alert).await {
            return;
        }

        // Check hourly limit
        if !self.can_send_alert().await {
            tracing::warn!(
                "Alert rate limit exceeded, skipping alert: {}",
                alert.message
            );
            return;
        }

        // Add to active alerts
        let mut active = self.active_alerts.write().await;
        active.insert(alert.id.clone(), alert.clone());
        drop(active);

        // Add to history
        let mut history = self.alert_history.write().await;
        history.push(alert.clone());
        drop(history);

        // Set cooldown
        self.set_cooldown(&alert).await;

        // Send notifications
        self.send_notifications(&alert).await;

        // Increment hourly count
        self.increment_hourly_count().await;
    }

    /// Check if alert is in cooldown period
    async fn is_in_cooldown(&self, alert: &PerformanceAlert) -> bool {
        let cooldowns = self.alert_cooldowns.read().await;
        if let Some(&last_sent) = cooldowns.get(&alert.metric_name) {
            last_sent.elapsed() < self.config.cooldown_duration
        } else {
            false
        }
    }

    /// Set cooldown for alert type
    async fn set_cooldown(&self, alert: &PerformanceAlert) {
        let mut cooldowns = self.alert_cooldowns.write().await;
        cooldowns.insert(alert.metric_name.clone(), Instant::now());
    }

    /// Check if we can send more alerts this hour
    async fn can_send_alert(&self) -> bool {
        // Reset hourly count if needed
        let mut last_reset = self.last_hour_reset.write().await;
        if last_reset.elapsed() >= Duration::from_secs(3600) {
            *last_reset = Instant::now();
            let mut count = self.hourly_alert_count.write().await;
            *count = 0;
        }
        drop(last_reset);

        let count = self.hourly_alert_count.read().await;
        *count < self.config.max_alerts_per_hour
    }

    /// Increment hourly alert count
    async fn increment_hourly_count(&self) {
        let mut count = self.hourly_alert_count.write().await;
        *count += 1;
    }

    /// Send alert notifications
    async fn send_notifications(&self, alert: &PerformanceAlert) {
        // Console logging
        if self.config.console_logging {
            match alert.severity {
                AlertSeverity::Info => {
                    tracing::info!("ALERT [{}]: {}", alert.severity_string(), alert.message)
                }
                AlertSeverity::Warning => {
                    tracing::warn!("ALERT [{}]: {}", alert.severity_string(), alert.message)
                }
                AlertSeverity::Critical => {
                    tracing::error!("ALERT [{}]: {}", alert.severity_string(), alert.message)
                }
                AlertSeverity::Emergency => {
                    tracing::error!("ALERT [{}]: {}", alert.severity_string(), alert.message)
                }
            }
        }

        // File logging (append so earlier alerts are not overwritten)
        if let Some(ref log_path) = self.config.file_logging {
            let log_entry = format!(
                "{} [{}] {}: {} (current: {:.2}, threshold: {:.2})\n",
                alert.timestamp,
                alert.severity_string(),
                alert.metric_name,
                alert.message,
                alert.current_value,
                alert.threshold_value
            );

            use tokio::io::AsyncWriteExt;
            match tokio::fs::OpenOptions::new()
                .create(true)
                .append(true)
                .open(log_path)
                .await
            {
                Ok(mut file) => {
                    if let Err(e) = file.write_all(log_entry.as_bytes()).await {
                        tracing::error!("Failed to write alert to log file: {}", e);
                    }
                }
                Err(e) => {
                    tracing::error!("Failed to open alert log file: {}", e);
                }
            }
        }

        // Email notifications
        if let Some(ref email_config) = self.config.email_notifications {
            if let Err(e) = self.send_email_alert(alert, email_config).await {
                tracing::error!("Failed to send email alert: {}", e);
            }
        }

        // Webhook notifications
        if let Some(ref webhook_config) = self.config.webhook_notifications {
            if let Err(e) = self.send_webhook_alert(alert, webhook_config).await {
                tracing::error!("Failed to send webhook alert: {}", e);
            }
        }
    }

    /// Send email alert
    async fn send_email_alert(
        &self,
        alert: &PerformanceAlert,
        config: &EmailConfig,
    ) -> Result<(), Box<dyn std::error::Error>> {
        // Email sending implementation would go here
        // For now, just log the attempt
        tracing::info!(
            "Would send email alert to {:?}: {}",
            config.to_emails,
            alert.message
        );
        Ok(())
    }

    /// Send webhook alert
    async fn send_webhook_alert(
        &self,
        alert: &PerformanceAlert,
        config: &WebhookConfig,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let client = reqwest::Client::new();
        let payload = serde_json::to_value(alert)?;

        for attempt in 0..=config.retry_attempts {
            let mut request = match config.method.to_uppercase().as_str() {
                "POST" => client.post(&config.url),
                "PUT" => client.put(&config.url),
                "PATCH" => client.patch(&config.url),
                _ => client.get(&config.url),
            };

            // Add headers
            for (key, value) in &config.headers {
                request = request.header(key, value);
            }

            // Add JSON body for POST/PUT/PATCH
            if matches!(
                config.method.to_uppercase().as_str(),
                "POST" | "PUT" | "PATCH"
            ) {
                request = request.json(&payload);
            }

            let response = request
                .timeout(Duration::from_secs(config.timeout_seconds))
                .send()
                .await;

            match response {
                Ok(resp) if resp.status().is_success() => {
                    tracing::info!("Webhook alert sent successfully to {}", config.url);
                    return Ok(());
                }
                Ok(resp) => {
                    tracing::warn!(
                        "Webhook alert failed with status {}: {}",
                        resp.status(),
                        config.url
                    );
                }
                Err(e) => {
                    tracing::warn!("Webhook alert attempt {} failed: {}", attempt + 1, e);
                }
            }

            if attempt < config.retry_attempts {
                tokio::time::sleep(Duration::from_secs(2_u64.pow(attempt as u32))).await;
            }
        }

        Err("All webhook attempts failed".into())
    }

    /// Get active alerts
    async fn get_active_alerts(&self) -> Vec<PerformanceAlert> {
        let active = self.active_alerts.read().await;
        active.values().cloned().collect()
    }

    /// Get alert history (newest first when a limit is given)
    async fn get_alert_history(&self, limit: Option<usize>) -> Vec<PerformanceAlert> {
        let history = self.alert_history.read().await;
        if let Some(limit) = limit {
            history.iter().rev().take(limit).cloned().collect()
        } else {
            history.clone()
        }
    }

    /// Resolve an alert
    pub async fn resolve_alert(&self, alert_id: &str) -> bool {
        let mut active = self.active_alerts.write().await;
        if let Some(mut alert) = active.remove(alert_id) {
            alert.resolved = true;
            alert.resolved_at = Some(
                SystemTime::now()
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or_default()
                    .as_secs(),
            );

            // Update in history
            let mut history = self.alert_history.write().await;
            if let Some(hist_alert) = history.iter_mut().find(|a| a.id == alert_id) {
                hist_alert.resolved = true;
                hist_alert.resolved_at = alert.resolved_at;
            }

            true
        } else {
            false
        }
    }
}

impl Clone for AlertManager {
    fn clone(&self) -> Self {
        Self {
            config: self.config.clone(),
            active_alerts: self.active_alerts.clone(),
            alert_history: self.alert_history.clone(),
            alert_cooldowns: self.alert_cooldowns.clone(),
            hourly_alert_count: self.hourly_alert_count.clone(),
            last_hour_reset: self.last_hour_reset.clone(),
        }
    }
}

impl PerformanceAlert {
    /// Get severity as string
    pub fn severity_string(&self) -> &'static str {
        match self.severity {
            AlertSeverity::Info => "INFO",
            AlertSeverity::Warning => "WARNING",
            AlertSeverity::Critical => "CRITICAL",
            AlertSeverity::Emergency => "EMERGENCY",
        }
    }
}

impl Default for MonitorConfig {
    fn default() -> Self {
        Self {
            interval: Duration::from_secs(10),
            enabled: false,
            thresholds: AlertThresholds::default(),
            alerts: AlertConfig::default(),
            targets: vec![
                MonitorTarget {
                    name: "system_cpu".to_string(),
                    target_type: MonitorTargetType::SystemCpu,
                    enabled: true,
                    custom_thresholds: None,
                    severity: AlertSeverity::Warning,
                },
                MonitorTarget {
                    name: "system_memory".to_string(),
                    target_type: MonitorTargetType::SystemMemory,
                    enabled: true,
                    custom_thresholds: None,
                    severity: AlertSeverity::Warning,
                },
                MonitorTarget {
                    name: "synthesis_performance".to_string(),
                    target_type: MonitorTargetType::SynthesisPerformance,
                    enabled: true,
                    custom_thresholds: None,
                    severity: AlertSeverity::Critical,
                },
            ],
            retention_duration: Duration::from_secs(3600), // 1 hour
            auto_recovery: AutoRecoveryConfig::default(),
        }
    }
}

impl Default for AlertThresholds {
    fn default() -> Self {
        Self {
            cpu_usage_percent: Some(80.0),
            memory_usage_percent: Some(85.0),
            gpu_utilization_percent: Some(95.0),
            gpu_memory_percent: Some(90.0),
            min_real_time_factor: Some(0.8),
            max_synthesis_time_ms: Some(5000.0),
            max_queue_depth: Some(20),
            min_success_rate_percent: Some(90.0),
            max_error_rate_percent: Some(10.0),
            max_gpu_temperature: Some(85.0),
            max_disk_usage_percent: Some(90.0),
            max_network_bps: Some(1_000_000_000), // 1 GB/s
        }
    }
}

impl Default for AlertConfig {
    fn default() -> Self {
        Self {
            console_logging: true,
            file_logging: None,
            email_notifications: None,
            webhook_notifications: None,
            cooldown_duration: Duration::from_secs(300), // 5 minutes
            max_alerts_per_hour: 20,
        }
    }
}

impl Default for AutoRecoveryConfig {
    fn default() -> Self {
        Self {
            enabled: false,
            actions: vec![
                RecoveryAction::ReduceBatchSize { min_size: 8 },
                RecoveryAction::ClearCaches,
                RecoveryAction::ReduceParallelism { min_threads: 2 },
            ],
            max_attempts_per_hour: 5,
            retry_delay: Duration::from_secs(30),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_performance_monitor_creation() {
        let config = MonitorConfig::default();
        let monitor = PerformanceMonitor::new(config);
        assert!(!monitor.is_running().await);
    }

    #[tokio::test]
    async fn test_alert_creation() {
        let config = MonitorConfig::default();
        let monitor = PerformanceMonitor::new(config);

        let target = MonitorTarget {
            name: "test_cpu".to_string(),
            target_type: MonitorTargetType::SystemCpu,
            enabled: true,
            custom_thresholds: None,
            severity: AlertSeverity::Warning,
        };

        let alert = monitor
            .create_alert(
                &target,
                "Test alert message",
                85.0,
                80.0,
                "cpu_usage_percent",
            )
            .await;

        assert_eq!(alert.severity, AlertSeverity::Warning);
        assert_eq!(alert.current_value, 85.0);
        assert_eq!(alert.threshold_value, 80.0);
        assert!(!alert.resolved);
    }

    #[tokio::test]
    async fn test_alert_manager() {
        let config = AlertConfig::default();
        let manager = AlertManager::new(config);

        let alert = PerformanceAlert {
            id: "test_alert".to_string(),
            timestamp: 1234567890,
            severity: AlertSeverity::Warning,
            target: MonitorTargetType::SystemCpu,
            message: "Test alert".to_string(),
            current_value: 85.0,
            threshold_value: 80.0,
            metric_name: "cpu_usage".to_string(),
            context: HashMap::new(),
            resolved: false,
            resolved_at: None,
        };

        // Process alert
        manager.process_alert(alert.clone()).await;

        // Check active alerts
        let active = manager.get_active_alerts().await;
        assert_eq!(active.len(), 1);
        assert_eq!(active[0].id, "test_alert");

        // Resolve alert
        let resolved = manager.resolve_alert("test_alert").await;
        assert!(resolved);

        // Check active alerts again
        let active = manager.get_active_alerts().await;
        assert_eq!(active.len(), 0);
    }
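
    // A sketch of an extra check (not in the original suite): alerts sharing a
    // metric name within the cooldown window should be suppressed, so only the
    // first of the two alerts below is recorded.
    #[tokio::test]
    async fn test_alert_cooldown_suppresses_duplicates() {
        let manager = AlertManager::new(AlertConfig::default());

        let make_alert = |id: &str| PerformanceAlert {
            id: id.to_string(),
            timestamp: 1234567890,
            severity: AlertSeverity::Warning,
            target: MonitorTargetType::SystemCpu,
            message: "Test alert".to_string(),
            current_value: 85.0,
            threshold_value: 80.0,
            metric_name: "cpu_usage".to_string(),
            context: HashMap::new(),
            resolved: false,
            resolved_at: None,
        };

        manager.process_alert(make_alert("first")).await;
        manager.process_alert(make_alert("second")).await;

        assert_eq!(manager.get_active_alerts().await.len(), 1);
        assert_eq!(manager.get_alert_history(None).await.len(), 1);
    }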

    #[test]
    fn test_alert_thresholds_default() {
        let thresholds = AlertThresholds::default();
        assert_eq!(thresholds.cpu_usage_percent, Some(80.0));
        assert_eq!(thresholds.memory_usage_percent, Some(85.0));
        assert_eq!(thresholds.min_real_time_factor, Some(0.8));
    }

    #[test]
    fn test_recovery_actions() {
        let action = RecoveryAction::ReduceBatchSize { min_size: 8 };
        match action {
            RecoveryAction::ReduceBatchSize { min_size } => {
                assert_eq!(min_size, 8);
            }
            _ => panic!("Wrong action type"),
        }
    }
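
    // An extra sanity check (not part of the original suite): the derived Ord
    // orders severities from Info up to Emergency.
    #[test]
    fn test_alert_severity_ordering() {
        assert!(AlertSeverity::Info < AlertSeverity::Warning);
        assert!(AlertSeverity::Warning < AlertSeverity::Critical);
        assert!(AlertSeverity::Critical < AlertSeverity::Emergency);
    }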

    #[tokio::test]
    async fn test_monitor_start_stop() {
        let config = MonitorConfig::default();
        let monitor = PerformanceMonitor::new(config);

        assert!(!monitor.is_running().await);

        monitor.start().await.unwrap();
        assert!(monitor.is_running().await);

        monitor.stop().await.unwrap();
        assert!(!monitor.is_running().await);
    }
}