Skip to main content

cuda_rust_wasm/nutanix/
monitoring.rs

1//! GPU monitoring and telemetry via Nutanix Prism Central
2//!
3//! Provides real-time GPU metrics collection, health assessment, utilization
4//! history tracking, and capacity forecasting for cuda-wasm workloads running
5//! on Nutanix clusters.
6
7#[cfg(feature = "serde")]
8use serde::{Deserialize, Serialize};
9
10use crate::error::CudaRustError;
11use super::config::NutanixConfig;
12
/// Point-in-time telemetry reading for one GPU device.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct GpuMetrics {
    /// Compute utilization of the GPU as a percentage (0-100).
    pub utilization_percent: f64,
    /// GPU memory currently allocated, in bytes.
    pub memory_used_bytes: u64,
    /// Total GPU memory installed, in bytes.
    pub memory_total_bytes: u64,
    /// GPU temperature, in degrees Celsius.
    pub temperature_celsius: f64,
    /// Power drawn by the GPU, in Watts.
    pub power_watts: f64,
    /// Core clock speed of the GPU, in MHz.
    pub clock_speed_mhz: u32,
    /// Fan speed as a percentage (0-100).
    pub fan_speed_percent: f64,
    /// Number of ECC errors (single-bit plus double-bit).
    pub ecc_errors: u64,
}
34
35impl GpuMetrics {
36    /// Memory utilization as a percentage
37    pub fn memory_utilization_percent(&self) -> f64 {
38        if self.memory_total_bytes == 0 {
39            return 0.0;
40        }
41        (self.memory_used_bytes as f64 / self.memory_total_bytes as f64) * 100.0
42    }
43
44    /// Whether the GPU is thermally throttling (above 85C)
45    pub fn is_throttling(&self) -> bool {
46        self.temperature_celsius > 85.0
47    }
48}
49
/// How serious an [`Alert`] is.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum AlertSeverity {
    /// Purely informational; no action implied.
    Info,
    /// Something needs attention.
    Warning,
    /// Immediate action is needed.
    Critical,
}
61
62impl std::fmt::Display for AlertSeverity {
63    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
64        match self {
65            AlertSeverity::Info => write!(f, "INFO"),
66            AlertSeverity::Warning => write!(f, "WARNING"),
67            AlertSeverity::Critical => write!(f, "CRITICAL"),
68        }
69    }
70}
71
72/// An alert generated from GPU metric analysis
73#[derive(Debug, Clone)]
74#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
75pub struct Alert {
76    /// Alert severity
77    pub severity: AlertSeverity,
78    /// Human-readable alert message
79    pub message: String,
80    /// Unix timestamp (seconds) when the alert was generated
81    pub timestamp: u64,
82    /// GPU device ID that triggered the alert (if applicable)
83    pub gpu_id: Option<String>,
84}
85
/// Aggregate health verdict for a node.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum HealthStatus {
    /// Every GPU is operating normally.
    Healthy,
    /// At least one GPU has a warning (e.g. high temperature or utilization).
    Warning,
    /// At least one GPU has a critical issue.
    Critical,
}
97
98impl std::fmt::Display for HealthStatus {
99    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
100        match self {
101            HealthStatus::Healthy => write!(f, "HEALTHY"),
102            HealthStatus::Warning => write!(f, "WARNING"),
103            HealthStatus::Critical => write!(f, "CRITICAL"),
104        }
105    }
106}
107
108/// Health assessment for a GPU node
109#[derive(Debug, Clone)]
110#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
111pub struct NodeHealth {
112    /// Node UUID
113    pub node_id: String,
114    /// Overall health status
115    pub overall_health: HealthStatus,
116    /// Per-GPU metrics (keyed by GPU device ID)
117    pub gpu_metrics: Vec<(String, GpuMetrics)>,
118    /// Active alerts
119    pub alerts: Vec<Alert>,
120}
121
/// Projected GPU capacity usage for a cluster.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct CapacityForecast {
    /// ID of the cluster the forecast applies to.
    pub cluster_id: String,
    /// Average GPU utilization across the cluster right now, as a percentage.
    pub current_utilization_percent: f64,
    /// Utilization expected at the forecast horizon, as a percentage.
    pub projected_utilization_percent: f64,
    /// Hours until utilization is projected to reach 90%.
    pub hours_until_90_percent: Option<u32>,
    /// Hours until utilization is projected to reach 100%.
    pub hours_until_full: Option<u32>,
    /// Action recommended on the basis of the forecast.
    pub recommendation: String,
}
139
140/// GPU monitoring client for collecting metrics from Nutanix clusters
141pub struct GpuMonitor {
142    /// Prism Central connection configuration
143    #[allow(dead_code)]
144    config: NutanixConfig,
145
146    /// HTTP client (when nutanix feature is available)
147    #[cfg(feature = "nutanix")]
148    #[allow(dead_code)]
149    client: reqwest::Client,
150}
151
152impl GpuMonitor {
153    /// Create a new GpuMonitor with the given configuration
154    pub fn new(config: NutanixConfig) -> Result<Self, CudaRustError> {
155        #[cfg(feature = "nutanix")]
156        {
157            let builder = reqwest::Client::builder().timeout(config.timeout);
158            let client = builder.build().map_err(|e| {
159                CudaRustError::RuntimeError(format!("Failed to create HTTP client: {}", e))
160            })?;
161            Ok(Self { config, client })
162        }
163
164        #[cfg(not(feature = "nutanix"))]
165        {
166            Ok(Self { config })
167        }
168    }
169
170    /// Collect current GPU metrics for all GPUs on a node
171    ///
172    /// Polls the Prism Central API for the latest GPU telemetry data.
173    pub async fn collect_metrics(
174        &self,
175        node_id: &str,
176    ) -> Result<Vec<GpuMetrics>, CudaRustError> {
177        #[cfg(feature = "nutanix")]
178        {
179            let _ = node_id;
180            Err(CudaRustError::RuntimeError(
181                "Live metrics collection requires Prism Central connection".to_string(),
182            ))
183        }
184
185        #[cfg(not(feature = "nutanix"))]
186        {
187            Ok(self.local_metrics(node_id))
188        }
189    }
190
191    /// Perform a health assessment of a node's GPUs
192    ///
193    /// Collects metrics and evaluates them against thresholds to determine
194    /// overall node health and generate any alerts.
195    pub async fn check_health(
196        &self,
197        node_id: &str,
198    ) -> Result<NodeHealth, CudaRustError> {
199        let metrics = self.collect_metrics(node_id).await?;
200        let now = std::time::SystemTime::now()
201            .duration_since(std::time::UNIX_EPOCH)
202            .unwrap_or_default()
203            .as_secs();
204
205        let mut alerts = Vec::new();
206        let mut worst_health = HealthStatus::Healthy;
207
208        let gpu_metrics: Vec<(String, GpuMetrics)> = metrics
209            .into_iter()
210            .enumerate()
211            .map(|(i, m)| {
212                let gpu_id = format!("{}-gpu-{}", node_id, i);
213
214                // Temperature checks
215                if m.temperature_celsius > 90.0 {
216                    alerts.push(Alert {
217                        severity: AlertSeverity::Critical,
218                        message: format!(
219                            "GPU {} temperature critical: {:.1}C",
220                            gpu_id, m.temperature_celsius
221                        ),
222                        timestamp: now,
223                        gpu_id: Some(gpu_id.clone()),
224                    });
225                    worst_health = HealthStatus::Critical;
226                } else if m.temperature_celsius > 80.0 {
227                    alerts.push(Alert {
228                        severity: AlertSeverity::Warning,
229                        message: format!(
230                            "GPU {} temperature high: {:.1}C",
231                            gpu_id, m.temperature_celsius
232                        ),
233                        timestamp: now,
234                        gpu_id: Some(gpu_id.clone()),
235                    });
236                    if worst_health != HealthStatus::Critical {
237                        worst_health = HealthStatus::Warning;
238                    }
239                }
240
241                // Memory utilization checks
242                let mem_pct = m.memory_utilization_percent();
243                if mem_pct > 95.0 {
244                    alerts.push(Alert {
245                        severity: AlertSeverity::Critical,
246                        message: format!(
247                            "GPU {} memory nearly exhausted: {:.1}%",
248                            gpu_id, mem_pct
249                        ),
250                        timestamp: now,
251                        gpu_id: Some(gpu_id.clone()),
252                    });
253                    worst_health = HealthStatus::Critical;
254                } else if mem_pct > 85.0 {
255                    alerts.push(Alert {
256                        severity: AlertSeverity::Warning,
257                        message: format!(
258                            "GPU {} memory utilization high: {:.1}%",
259                            gpu_id, mem_pct
260                        ),
261                        timestamp: now,
262                        gpu_id: Some(gpu_id.clone()),
263                    });
264                    if worst_health != HealthStatus::Critical {
265                        worst_health = HealthStatus::Warning;
266                    }
267                }
268
269                // ECC error checks
270                if m.ecc_errors > 0 {
271                    let severity = if m.ecc_errors > 10 {
272                        worst_health = HealthStatus::Critical;
273                        AlertSeverity::Critical
274                    } else {
275                        if worst_health != HealthStatus::Critical {
276                            worst_health = HealthStatus::Warning;
277                        }
278                        AlertSeverity::Warning
279                    };
280                    alerts.push(Alert {
281                        severity,
282                        message: format!(
283                            "GPU {} has {} ECC errors",
284                            gpu_id, m.ecc_errors
285                        ),
286                        timestamp: now,
287                        gpu_id: Some(gpu_id.clone()),
288                    });
289                }
290
291                (gpu_id, m)
292            })
293            .collect();
294
295        Ok(NodeHealth {
296            node_id: node_id.to_string(),
297            overall_health: worst_health,
298            gpu_metrics,
299            alerts,
300        })
301    }
302
303    /// Retrieve utilization history for GPUs on a node
304    ///
305    /// Returns timestamped metric snapshots over the requested duration.
306    pub async fn get_utilization_history(
307        &self,
308        node_id: &str,
309        duration_minutes: u32,
310    ) -> Result<Vec<(u64, GpuMetrics)>, CudaRustError> {
311        #[cfg(feature = "nutanix")]
312        {
313            let _ = (node_id, duration_minutes);
314            Err(CudaRustError::RuntimeError(
315                "History collection requires Prism Central connection".to_string(),
316            ))
317        }
318
319        #[cfg(not(feature = "nutanix"))]
320        {
321            Ok(self.local_utilization_history(node_id, duration_minutes))
322        }
323    }
324
325    /// Predict future capacity usage for a cluster using linear projection
326    ///
327    /// Analyzes recent utilization trends to estimate when the cluster
328    /// will reach capacity thresholds (90% and 100%).
329    pub async fn predict_capacity(
330        &self,
331        cluster_id: &str,
332        hours_ahead: u32,
333    ) -> Result<CapacityForecast, CudaRustError> {
334        #[cfg(feature = "nutanix")]
335        {
336            let _ = (cluster_id, hours_ahead);
337            Err(CudaRustError::RuntimeError(
338                "Capacity prediction requires Prism Central connection".to_string(),
339            ))
340        }
341
342        #[cfg(not(feature = "nutanix"))]
343        {
344            Ok(self.local_capacity_forecast(cluster_id, hours_ahead))
345        }
346    }
347
348    // --- Local system probing for non-nutanix builds ---
349
350    /// Collect GPU metrics by querying `nvidia-smi` on the local system.
351    ///
352    /// Returns an empty vector if no NVIDIA GPUs or nvidia-smi is available.
353    #[cfg(not(feature = "nutanix"))]
354    fn local_metrics(&self, _node_id: &str) -> Vec<GpuMetrics> {
355        if let Ok(output) = std::process::Command::new("nvidia-smi")
356            .args([
357                "--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,clocks.current.graphics,fan.speed",
358                "--format=csv,noheader,nounits",
359            ])
360            .output()
361        {
362            if output.status.success() {
363                let stdout = String::from_utf8_lossy(&output.stdout);
364                return stdout
365                    .lines()
366                    .filter_map(|line| {
367                        let parts: Vec<&str> = line.split(", ").collect();
368                        if parts.len() >= 7 {
369                            Some(GpuMetrics {
370                                utilization_percent: parts[0]
371                                    .trim()
372                                    .parse()
373                                    .unwrap_or(0.0),
374                                memory_used_bytes: parts[1]
375                                    .trim()
376                                    .parse::<u64>()
377                                    .unwrap_or(0)
378                                    * 1024
379                                    * 1024,
380                                memory_total_bytes: parts[2]
381                                    .trim()
382                                    .parse::<u64>()
383                                    .unwrap_or(0)
384                                    * 1024
385                                    * 1024,
386                                temperature_celsius: parts[3]
387                                    .trim()
388                                    .parse()
389                                    .unwrap_or(0.0),
390                                power_watts: parts[4]
391                                    .trim()
392                                    .parse()
393                                    .unwrap_or(0.0),
394                                clock_speed_mhz: parts[5]
395                                    .trim()
396                                    .parse()
397                                    .unwrap_or(0),
398                                fan_speed_percent: parts[6]
399                                    .trim()
400                                    .parse()
401                                    .unwrap_or(0.0),
402                                ecc_errors: 0,
403                            })
404                        } else {
405                            None
406                        }
407                    })
408                    .collect();
409            }
410        }
411
412        // No GPU metrics available
413        Vec::new()
414    }
415
416    /// Return utilization history as a single current-time snapshot.
417    ///
418    /// Without a time-series database we cannot provide true history,
419    /// so we return one data point at the current timestamp per GPU.
420    #[cfg(not(feature = "nutanix"))]
421    fn local_utilization_history(
422        &self,
423        node_id: &str,
424        _duration_minutes: u32,
425    ) -> Vec<(u64, GpuMetrics)> {
426        let now = std::time::SystemTime::now()
427            .duration_since(std::time::UNIX_EPOCH)
428            .unwrap_or_default()
429            .as_secs();
430        self.local_metrics(node_id)
431            .into_iter()
432            .map(|m| (now, m))
433            .collect()
434    }
435
436    /// Generate a capacity forecast based on current local GPU utilization.
437    ///
438    /// Uses a simple linear projection with a 0.5%/hour growth assumption.
439    /// Returns a 0% utilization forecast when no GPU metrics are available.
440    #[cfg(not(feature = "nutanix"))]
441    fn local_capacity_forecast(
442        &self,
443        cluster_id: &str,
444        hours_ahead: u32,
445    ) -> CapacityForecast {
446        let metrics = self.local_metrics(cluster_id);
447        let current_util = metrics
448            .first()
449            .map(|m| m.utilization_percent)
450            .unwrap_or(0.0);
451        let growth_rate = 0.5; // 0.5% per hour assumption
452        let projected =
453            (current_util + growth_rate * hours_ahead as f64).min(100.0);
454
455        CapacityForecast {
456            cluster_id: cluster_id.to_string(),
457            current_utilization_percent: current_util,
458            projected_utilization_percent: projected,
459            hours_until_90_percent: if current_util < 90.0 {
460                Some(((90.0 - current_util) / growth_rate) as u32)
461            } else {
462                Some(0)
463            },
464            hours_until_full: if current_util < 100.0 {
465                Some(((100.0 - current_util) / growth_rate) as u32)
466            } else {
467                Some(0)
468            },
469            recommendation: if projected > 90.0 {
470                "Consider adding GPU nodes".to_string()
471            } else if projected > 75.0 {
472                "Monitor closely".to_string()
473            } else {
474                "Capacity sufficient".to_string()
475            },
476        }
477    }
478}
479
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a monitor pointed at a placeholder Prism Central endpoint.
    ///
    /// Only the local-probing tests use this helper, so it is compiled out
    /// together with them when the `nutanix` feature is enabled.
    #[cfg(not(feature = "nutanix"))]
    fn make_monitor() -> GpuMonitor {
        let config = NutanixConfig::new("https://prism.example.com:9440", "test-key");
        GpuMonitor::new(config).unwrap()
    }

    #[test]
    fn test_gpu_metrics_memory_utilization() {
        let m = GpuMetrics {
            utilization_percent: 50.0,
            memory_used_bytes: 40 * 1024 * 1024 * 1024,
            memory_total_bytes: 80 * 1024 * 1024 * 1024,
            temperature_celsius: 70.0,
            power_watts: 250.0,
            clock_speed_mhz: 1400,
            fan_speed_percent: 50.0,
            ecc_errors: 0,
        };
        let pct = m.memory_utilization_percent();
        assert!((pct - 50.0).abs() < 0.01);
    }

    #[test]
    fn test_gpu_metrics_throttling() {
        let normal = GpuMetrics {
            utilization_percent: 80.0,
            memory_used_bytes: 0,
            memory_total_bytes: 80 * 1024 * 1024 * 1024,
            temperature_celsius: 75.0,
            power_watts: 250.0,
            clock_speed_mhz: 1400,
            fan_speed_percent: 50.0,
            ecc_errors: 0,
        };
        assert!(!normal.is_throttling());

        let hot = GpuMetrics {
            temperature_celsius: 92.0,
            ..normal
        };
        assert!(hot.is_throttling());
    }

    #[test]
    fn test_alert_severity_display() {
        assert_eq!(AlertSeverity::Info.to_string(), "INFO");
        assert_eq!(AlertSeverity::Warning.to_string(), "WARNING");
        assert_eq!(AlertSeverity::Critical.to_string(), "CRITICAL");
    }

    #[test]
    fn test_health_status_display() {
        assert_eq!(HealthStatus::Healthy.to_string(), "HEALTHY");
        assert_eq!(HealthStatus::Warning.to_string(), "WARNING");
        assert_eq!(HealthStatus::Critical.to_string(), "CRITICAL");
    }

    // The `test_local_*` tests exercise the local probing fallback. With the
    // `nutanix` feature enabled the public methods return `Err`, so the
    // `unwrap()` calls below would panic; the tests are therefore compiled
    // only when that feature is off.

    #[cfg(not(feature = "nutanix"))]
    #[tokio::test]
    async fn test_local_collect_metrics() {
        let monitor = make_monitor();
        let metrics = monitor.collect_metrics("node-001").await.unwrap();
        // On systems without GPUs this will be empty -- that is correct.
        // On GPU systems each metric should have sensible values.
        for m in &metrics {
            assert!(m.utilization_percent >= 0.0 && m.utilization_percent <= 100.0);
            assert!(m.memory_total_bytes >= m.memory_used_bytes);
        }
    }

    #[cfg(not(feature = "nutanix"))]
    #[tokio::test]
    async fn test_local_check_health() {
        let monitor = make_monitor();
        let health = monitor.check_health("node-001").await.unwrap();
        assert_eq!(health.node_id, "node-001");
        // On systems without GPUs, gpu_metrics will be empty and health is Healthy
        // On GPU systems, health depends on actual temperature/utilization
        if health.gpu_metrics.is_empty() {
            assert_eq!(health.overall_health, HealthStatus::Healthy);
            assert!(health.alerts.is_empty());
        }
    }

    #[cfg(not(feature = "nutanix"))]
    #[tokio::test]
    async fn test_local_utilization_history() {
        let monitor = make_monitor();
        let history = monitor
            .get_utilization_history("node-001", 30)
            .await
            .unwrap();
        // Without GPUs this is empty; with GPUs we get one snapshot per GPU
        for (ts, _metrics) in &history {
            assert!(*ts > 0);
        }
    }

    #[cfg(not(feature = "nutanix"))]
    #[tokio::test]
    async fn test_local_predict_capacity() {
        let monitor = make_monitor();
        let forecast = monitor
            .predict_capacity("cluster-001", 48)
            .await
            .unwrap();
        assert_eq!(forecast.cluster_id, "cluster-001");
        // Projected should always be >= current (growth rate is positive)
        assert!(
            forecast.projected_utilization_percent
                >= forecast.current_utilization_percent
        );
        assert!(forecast.hours_until_90_percent.is_some());
        assert!(forecast.hours_until_full.is_some());
    }

    #[cfg(not(feature = "nutanix"))]
    #[tokio::test]
    async fn test_local_predict_capacity_long_horizon() {
        let monitor = make_monitor();
        let forecast = monitor
            .predict_capacity("cluster-001", 500)
            .await
            .unwrap();
        // With 0.5%/hr growth over 500 hours, projected should cap at 100
        assert!(forecast.projected_utilization_percent <= 100.0);
    }

    #[test]
    fn test_memory_utilization_zero_total() {
        let m = GpuMetrics {
            utilization_percent: 0.0,
            memory_used_bytes: 0,
            memory_total_bytes: 0,
            temperature_celsius: 0.0,
            power_watts: 0.0,
            clock_speed_mhz: 0,
            fan_speed_percent: 0.0,
            ecc_errors: 0,
        };
        assert!((m.memory_utilization_percent()).abs() < 0.01);
    }

    #[test]
    fn test_monitor_creation() {
        let config = NutanixConfig::new("https://prism.example.com:9440", "key");
        let monitor = GpuMonitor::new(config);
        assert!(monitor.is_ok());
    }
}
627}