1#[cfg(feature = "serde")]
8use serde::{Deserialize, Serialize};
9
10use crate::error::CudaRustError;
11use super::config::NutanixConfig;
12
/// A single sample of telemetry for one GPU.
///
/// Units are carried in the field names; nothing in this type validates ranges.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct GpuMetrics {
    /// GPU utilization, in percent.
    pub utilization_percent: f64,
    /// Memory currently in use, in bytes.
    pub memory_used_bytes: u64,
    /// Total memory, in bytes.
    pub memory_total_bytes: u64,
    /// Temperature, in degrees Celsius.
    pub temperature_celsius: f64,
    /// Power draw, in watts.
    pub power_watts: f64,
    /// Clock speed, in MHz.
    pub clock_speed_mhz: u32,
    /// Fan speed, in percent.
    pub fan_speed_percent: f64,
    /// Number of ECC errors reported for the GPU.
    pub ecc_errors: u64,
}
34
35impl GpuMetrics {
36 pub fn memory_utilization_percent(&self) -> f64 {
38 if self.memory_total_bytes == 0 {
39 return 0.0;
40 }
41 (self.memory_used_bytes as f64 / self.memory_total_bytes as f64) * 100.0
42 }
43
44 pub fn is_throttling(&self) -> bool {
46 self.temperature_celsius > 85.0
47 }
48}
49
/// How serious an [`Alert`] is, from informational up to critical.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum AlertSeverity {
    /// Informational only.
    Info,
    /// Something needs attention.
    Warning,
    /// Something needs immediate attention.
    Critical,
}
61
62impl std::fmt::Display for AlertSeverity {
63 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
64 match self {
65 AlertSeverity::Info => write!(f, "INFO"),
66 AlertSeverity::Warning => write!(f, "WARNING"),
67 AlertSeverity::Critical => write!(f, "CRITICAL"),
68 }
69 }
70}
71
72#[derive(Debug, Clone)]
74#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
75pub struct Alert {
76 pub severity: AlertSeverity,
78 pub message: String,
80 pub timestamp: u64,
82 pub gpu_id: Option<String>,
84}
85
/// Aggregate health classification for a node.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum HealthStatus {
    /// No warning or critical condition was found.
    Healthy,
    /// At least one warning-level condition was found.
    Warning,
    /// At least one critical condition was found.
    Critical,
}
97
98impl std::fmt::Display for HealthStatus {
99 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
100 match self {
101 HealthStatus::Healthy => write!(f, "HEALTHY"),
102 HealthStatus::Warning => write!(f, "WARNING"),
103 HealthStatus::Critical => write!(f, "CRITICAL"),
104 }
105 }
106}
107
108#[derive(Debug, Clone)]
110#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
111pub struct NodeHealth {
112 pub node_id: String,
114 pub overall_health: HealthStatus,
116 pub gpu_metrics: Vec<(String, GpuMetrics)>,
118 pub alerts: Vec<Alert>,
120}
121
/// Projected GPU capacity for a cluster.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct CapacityForecast {
    /// Identifier of the cluster the forecast is for.
    pub cluster_id: String,
    /// Utilization at the time of the forecast, in percent.
    pub current_utilization_percent: f64,
    /// Utilization expected at the end of the forecast horizon, in percent.
    pub projected_utilization_percent: f64,
    /// Estimated hours until utilization reaches 90 percent, if known.
    pub hours_until_90_percent: Option<u32>,
    /// Estimated hours until utilization reaches 100 percent, if known.
    pub hours_until_full: Option<u32>,
    /// Human-readable capacity advice.
    pub recommendation: String,
}
139
140pub struct GpuMonitor {
142 #[allow(dead_code)]
144 config: NutanixConfig,
145
146 #[cfg(feature = "nutanix")]
148 #[allow(dead_code)]
149 client: reqwest::Client,
150}
151
152impl GpuMonitor {
153 pub fn new(config: NutanixConfig) -> Result<Self, CudaRustError> {
155 #[cfg(feature = "nutanix")]
156 {
157 let builder = reqwest::Client::builder().timeout(config.timeout);
158 let client = builder.build().map_err(|e| {
159 CudaRustError::RuntimeError(format!("Failed to create HTTP client: {}", e))
160 })?;
161 Ok(Self { config, client })
162 }
163
164 #[cfg(not(feature = "nutanix"))]
165 {
166 Ok(Self { config })
167 }
168 }
169
170 pub async fn collect_metrics(
174 &self,
175 node_id: &str,
176 ) -> Result<Vec<GpuMetrics>, CudaRustError> {
177 #[cfg(feature = "nutanix")]
178 {
179 let _ = node_id;
180 Err(CudaRustError::RuntimeError(
181 "Live metrics collection requires Prism Central connection".to_string(),
182 ))
183 }
184
185 #[cfg(not(feature = "nutanix"))]
186 {
187 Ok(self.local_metrics(node_id))
188 }
189 }
190
191 pub async fn check_health(
196 &self,
197 node_id: &str,
198 ) -> Result<NodeHealth, CudaRustError> {
199 let metrics = self.collect_metrics(node_id).await?;
200 let now = std::time::SystemTime::now()
201 .duration_since(std::time::UNIX_EPOCH)
202 .unwrap_or_default()
203 .as_secs();
204
205 let mut alerts = Vec::new();
206 let mut worst_health = HealthStatus::Healthy;
207
208 let gpu_metrics: Vec<(String, GpuMetrics)> = metrics
209 .into_iter()
210 .enumerate()
211 .map(|(i, m)| {
212 let gpu_id = format!("{}-gpu-{}", node_id, i);
213
214 if m.temperature_celsius > 90.0 {
216 alerts.push(Alert {
217 severity: AlertSeverity::Critical,
218 message: format!(
219 "GPU {} temperature critical: {:.1}C",
220 gpu_id, m.temperature_celsius
221 ),
222 timestamp: now,
223 gpu_id: Some(gpu_id.clone()),
224 });
225 worst_health = HealthStatus::Critical;
226 } else if m.temperature_celsius > 80.0 {
227 alerts.push(Alert {
228 severity: AlertSeverity::Warning,
229 message: format!(
230 "GPU {} temperature high: {:.1}C",
231 gpu_id, m.temperature_celsius
232 ),
233 timestamp: now,
234 gpu_id: Some(gpu_id.clone()),
235 });
236 if worst_health != HealthStatus::Critical {
237 worst_health = HealthStatus::Warning;
238 }
239 }
240
241 let mem_pct = m.memory_utilization_percent();
243 if mem_pct > 95.0 {
244 alerts.push(Alert {
245 severity: AlertSeverity::Critical,
246 message: format!(
247 "GPU {} memory nearly exhausted: {:.1}%",
248 gpu_id, mem_pct
249 ),
250 timestamp: now,
251 gpu_id: Some(gpu_id.clone()),
252 });
253 worst_health = HealthStatus::Critical;
254 } else if mem_pct > 85.0 {
255 alerts.push(Alert {
256 severity: AlertSeverity::Warning,
257 message: format!(
258 "GPU {} memory utilization high: {:.1}%",
259 gpu_id, mem_pct
260 ),
261 timestamp: now,
262 gpu_id: Some(gpu_id.clone()),
263 });
264 if worst_health != HealthStatus::Critical {
265 worst_health = HealthStatus::Warning;
266 }
267 }
268
269 if m.ecc_errors > 0 {
271 let severity = if m.ecc_errors > 10 {
272 worst_health = HealthStatus::Critical;
273 AlertSeverity::Critical
274 } else {
275 if worst_health != HealthStatus::Critical {
276 worst_health = HealthStatus::Warning;
277 }
278 AlertSeverity::Warning
279 };
280 alerts.push(Alert {
281 severity,
282 message: format!(
283 "GPU {} has {} ECC errors",
284 gpu_id, m.ecc_errors
285 ),
286 timestamp: now,
287 gpu_id: Some(gpu_id.clone()),
288 });
289 }
290
291 (gpu_id, m)
292 })
293 .collect();
294
295 Ok(NodeHealth {
296 node_id: node_id.to_string(),
297 overall_health: worst_health,
298 gpu_metrics,
299 alerts,
300 })
301 }
302
303 pub async fn get_utilization_history(
307 &self,
308 node_id: &str,
309 duration_minutes: u32,
310 ) -> Result<Vec<(u64, GpuMetrics)>, CudaRustError> {
311 #[cfg(feature = "nutanix")]
312 {
313 let _ = (node_id, duration_minutes);
314 Err(CudaRustError::RuntimeError(
315 "History collection requires Prism Central connection".to_string(),
316 ))
317 }
318
319 #[cfg(not(feature = "nutanix"))]
320 {
321 Ok(self.local_utilization_history(node_id, duration_minutes))
322 }
323 }
324
325 pub async fn predict_capacity(
330 &self,
331 cluster_id: &str,
332 hours_ahead: u32,
333 ) -> Result<CapacityForecast, CudaRustError> {
334 #[cfg(feature = "nutanix")]
335 {
336 let _ = (cluster_id, hours_ahead);
337 Err(CudaRustError::RuntimeError(
338 "Capacity prediction requires Prism Central connection".to_string(),
339 ))
340 }
341
342 #[cfg(not(feature = "nutanix"))]
343 {
344 Ok(self.local_capacity_forecast(cluster_id, hours_ahead))
345 }
346 }
347
348 #[cfg(not(feature = "nutanix"))]
354 fn local_metrics(&self, _node_id: &str) -> Vec<GpuMetrics> {
355 if let Ok(output) = std::process::Command::new("nvidia-smi")
356 .args([
357 "--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,clocks.current.graphics,fan.speed",
358 "--format=csv,noheader,nounits",
359 ])
360 .output()
361 {
362 if output.status.success() {
363 let stdout = String::from_utf8_lossy(&output.stdout);
364 return stdout
365 .lines()
366 .filter_map(|line| {
367 let parts: Vec<&str> = line.split(", ").collect();
368 if parts.len() >= 7 {
369 Some(GpuMetrics {
370 utilization_percent: parts[0]
371 .trim()
372 .parse()
373 .unwrap_or(0.0),
374 memory_used_bytes: parts[1]
375 .trim()
376 .parse::<u64>()
377 .unwrap_or(0)
378 * 1024
379 * 1024,
380 memory_total_bytes: parts[2]
381 .trim()
382 .parse::<u64>()
383 .unwrap_or(0)
384 * 1024
385 * 1024,
386 temperature_celsius: parts[3]
387 .trim()
388 .parse()
389 .unwrap_or(0.0),
390 power_watts: parts[4]
391 .trim()
392 .parse()
393 .unwrap_or(0.0),
394 clock_speed_mhz: parts[5]
395 .trim()
396 .parse()
397 .unwrap_or(0),
398 fan_speed_percent: parts[6]
399 .trim()
400 .parse()
401 .unwrap_or(0.0),
402 ecc_errors: 0,
403 })
404 } else {
405 None
406 }
407 })
408 .collect();
409 }
410 }
411
412 Vec::new()
414 }
415
416 #[cfg(not(feature = "nutanix"))]
421 fn local_utilization_history(
422 &self,
423 node_id: &str,
424 _duration_minutes: u32,
425 ) -> Vec<(u64, GpuMetrics)> {
426 let now = std::time::SystemTime::now()
427 .duration_since(std::time::UNIX_EPOCH)
428 .unwrap_or_default()
429 .as_secs();
430 self.local_metrics(node_id)
431 .into_iter()
432 .map(|m| (now, m))
433 .collect()
434 }
435
436 #[cfg(not(feature = "nutanix"))]
441 fn local_capacity_forecast(
442 &self,
443 cluster_id: &str,
444 hours_ahead: u32,
445 ) -> CapacityForecast {
446 let metrics = self.local_metrics(cluster_id);
447 let current_util = metrics
448 .first()
449 .map(|m| m.utilization_percent)
450 .unwrap_or(0.0);
451 let growth_rate = 0.5; let projected =
453 (current_util + growth_rate * hours_ahead as f64).min(100.0);
454
455 CapacityForecast {
456 cluster_id: cluster_id.to_string(),
457 current_utilization_percent: current_util,
458 projected_utilization_percent: projected,
459 hours_until_90_percent: if current_util < 90.0 {
460 Some(((90.0 - current_util) / growth_rate) as u32)
461 } else {
462 Some(0)
463 },
464 hours_until_full: if current_util < 100.0 {
465 Some(((100.0 - current_util) / growth_rate) as u32)
466 } else {
467 Some(0)
468 },
469 recommendation: if projected > 90.0 {
470 "Consider adding GPU nodes".to_string()
471 } else if projected > 75.0 {
472 "Monitor closely".to_string()
473 } else {
474 "Capacity sufficient".to_string()
475 },
476 }
477 }
478}
479
#[cfg(test)]
mod tests {
    use super::*;

    // The `test_local_*` tests below exercise the code path that is compiled
    // only without the `nutanix` feature. With that feature enabled the
    // monitor's async methods return an error, so unwrapping their results
    // would panic; those tests (and this helper) are therefore gated.

    /// Builds a monitor pointing at a placeholder endpoint.
    #[cfg(not(feature = "nutanix"))]
    fn make_monitor() -> GpuMonitor {
        let config = NutanixConfig::new("https://prism.example.com:9440", "test-key");
        GpuMonitor::new(config).unwrap()
    }

    #[test]
    fn test_gpu_metrics_memory_utilization() {
        let m = GpuMetrics {
            utilization_percent: 50.0,
            memory_used_bytes: 40 * 1024 * 1024 * 1024,
            memory_total_bytes: 80 * 1024 * 1024 * 1024,
            temperature_celsius: 70.0,
            power_watts: 250.0,
            clock_speed_mhz: 1400,
            fan_speed_percent: 50.0,
            ecc_errors: 0,
        };
        let pct = m.memory_utilization_percent();
        assert!((pct - 50.0).abs() < 0.01);
    }

    #[test]
    fn test_gpu_metrics_throttling() {
        let normal = GpuMetrics {
            utilization_percent: 80.0,
            memory_used_bytes: 0,
            memory_total_bytes: 80 * 1024 * 1024 * 1024,
            temperature_celsius: 75.0,
            power_watts: 250.0,
            clock_speed_mhz: 1400,
            fan_speed_percent: 50.0,
            ecc_errors: 0,
        };
        assert!(!normal.is_throttling());

        let hot = GpuMetrics {
            temperature_celsius: 92.0,
            ..normal
        };
        assert!(hot.is_throttling());
    }

    #[test]
    fn test_alert_severity_display() {
        assert_eq!(AlertSeverity::Info.to_string(), "INFO");
        assert_eq!(AlertSeverity::Warning.to_string(), "WARNING");
        assert_eq!(AlertSeverity::Critical.to_string(), "CRITICAL");
    }

    #[test]
    fn test_health_status_display() {
        assert_eq!(HealthStatus::Healthy.to_string(), "HEALTHY");
        assert_eq!(HealthStatus::Warning.to_string(), "WARNING");
        assert_eq!(HealthStatus::Critical.to_string(), "CRITICAL");
    }

    #[cfg(not(feature = "nutanix"))]
    #[tokio::test]
    async fn test_local_collect_metrics() {
        let monitor = make_monitor();
        let metrics = monitor.collect_metrics("node-001").await.unwrap();
        // The list is empty on hosts without a working `nvidia-smi`; any
        // sample that is present must be internally consistent.
        for m in &metrics {
            assert!(m.utilization_percent >= 0.0 && m.utilization_percent <= 100.0);
            assert!(m.memory_total_bytes >= m.memory_used_bytes);
        }
    }

    #[cfg(not(feature = "nutanix"))]
    #[tokio::test]
    async fn test_local_check_health() {
        let monitor = make_monitor();
        let health = monitor.check_health("node-001").await.unwrap();
        assert_eq!(health.node_id, "node-001");
        // Without any GPU there is nothing to alert on.
        if health.gpu_metrics.is_empty() {
            assert_eq!(health.overall_health, HealthStatus::Healthy);
            assert!(health.alerts.is_empty());
        }
    }

    #[cfg(not(feature = "nutanix"))]
    #[tokio::test]
    async fn test_local_utilization_history() {
        let monitor = make_monitor();
        let history = monitor
            .get_utilization_history("node-001", 30)
            .await
            .unwrap();
        for (ts, _metrics) in &history {
            assert!(*ts > 0);
        }
    }

    #[cfg(not(feature = "nutanix"))]
    #[tokio::test]
    async fn test_local_predict_capacity() {
        let monitor = make_monitor();
        let forecast = monitor
            .predict_capacity("cluster-001", 48)
            .await
            .unwrap();
        assert_eq!(forecast.cluster_id, "cluster-001");
        assert!(
            forecast.projected_utilization_percent
                >= forecast.current_utilization_percent
        );
        assert!(forecast.hours_until_90_percent.is_some());
        assert!(forecast.hours_until_full.is_some());
    }

    #[cfg(not(feature = "nutanix"))]
    #[tokio::test]
    async fn test_local_predict_capacity_long_horizon() {
        let monitor = make_monitor();
        let forecast = monitor
            .predict_capacity("cluster-001", 500)
            .await
            .unwrap();
        // The projection is capped, however far ahead we look.
        assert!(forecast.projected_utilization_percent <= 100.0);
    }

    #[test]
    fn test_memory_utilization_zero_total() {
        let m = GpuMetrics {
            utilization_percent: 0.0,
            memory_used_bytes: 0,
            memory_total_bytes: 0,
            temperature_celsius: 0.0,
            power_watts: 0.0,
            clock_speed_mhz: 0,
            fan_speed_percent: 0.0,
            ecc_errors: 0,
        };
        assert!((m.memory_utilization_percent()).abs() < 0.01);
    }

    #[test]
    fn test_monitor_creation() {
        let config = NutanixConfig::new("https://prism.example.com:9440", "key");
        let monitor = GpuMonitor::new(config);
        assert!(monitor.is_ok());
    }
}