Skip to main content

trustformers_debug/environmental_monitor/
energy_monitoring.rs

1//! Energy consumption monitoring and analysis
2
3use crate::environmental_monitor::types::*;
4use anyhow::Result;
5use std::collections::HashMap;
6use tracing::{debug, info};
7
8/// Energy consumption monitoring system
9#[derive(Debug)]
10pub struct EnergyConsumptionMonitor {
11    device_monitors: HashMap<String, DeviceEnergyMonitor>,
12    consumption_history: Vec<EnergyMeasurement>,
13    power_profiles: HashMap<String, PowerProfile>,
14    efficiency_metrics: EnergyEfficiencyMetrics,
15}
16
17/// Device-specific energy monitor
18#[derive(Debug)]
19#[allow(dead_code)]
20struct DeviceEnergyMonitor {
21    device_id: String,
22    device_type: DeviceType,
23    #[allow(dead_code)]
24    power_measurement_method: PowerMeasurementMethod,
25    baseline_power: f64,  // Watts
26    current_power: f64,   // Watts
27    energy_consumed: f64, // kWh
28    last_update: std::time::SystemTime,
29}
30
31/// Power profile for different device types
32#[allow(dead_code)]
33#[derive(Debug, Clone)]
34struct PowerProfile {
35    #[allow(dead_code)]
36    device_type: DeviceType,
37    idle_power: f64,
38    max_power: f64,
39    utilization_curve: Vec<(f64, f64)>, // (utilization, power_ratio)
40}
41
42impl EnergyConsumptionMonitor {
43    /// Create a new energy consumption monitor
44    pub fn new() -> Self {
45        Self {
46            device_monitors: HashMap::new(),
47            consumption_history: Vec::new(),
48            power_profiles: Self::create_default_power_profiles(),
49            efficiency_metrics: EnergyEfficiencyMetrics {
50                operations_per_kwh: 0.0,
51                flops_per_watt: 0.0,
52                model_energy_efficiency: 0.0,
53                training_energy_efficiency: 0.0,
54                inference_energy_efficiency: 0.0,
55                comparative_efficiency: ComparativeEfficiency {
56                    vs_cpu_only: 0.0,
57                    vs_previous_generation: 0.0,
58                    vs_cloud_baseline: 0.0,
59                    efficiency_percentile: 0.0,
60                },
61            },
62        }
63    }
64
65    /// Add a device for energy monitoring
66    pub fn add_device(
67        &mut self,
68        device_id: String,
69        device_type: DeviceType,
70        measurement_method: PowerMeasurementMethod,
71    ) -> Result<()> {
72        let power_profile = self.get_power_profile(&device_type);
73
74        let monitor = DeviceEnergyMonitor {
75            device_id: device_id.clone(),
76            device_type: device_type.clone(),
77            power_measurement_method: measurement_method,
78            baseline_power: power_profile.idle_power,
79            current_power: power_profile.idle_power,
80            energy_consumed: 0.0,
81            last_update: std::time::SystemTime::now(),
82        };
83
84        self.device_monitors.insert(device_id.clone(), monitor);
85        info!("Added device {} for energy monitoring", device_id);
86        Ok(())
87    }
88
89    /// Record energy measurement for a device
90    pub fn record_measurement(
91        &mut self,
92        device_id: &str,
93        power_watts: f64,
94        utilization: f64,
95        temperature: Option<f64>,
96    ) -> Result<EnergyMeasurement> {
97        let now = std::time::SystemTime::now();
98
99        // Get device type and update energy consumption in a separate scope
100        let (device_type, updated_energy_kwh) = {
101            let device = self
102                .device_monitors
103                .get_mut(device_id)
104                .ok_or_else(|| anyhow::anyhow!("Device {} not found", device_id))?;
105
106            let duration_hours = now.duration_since(device.last_update)?.as_secs_f64() / 3600.0;
107
108            // Update energy consumption
109            device.energy_consumed += power_watts * duration_hours / 1000.0; // Convert to kWh
110            device.current_power = power_watts;
111            device.last_update = now;
112
113            (device.device_type.clone(), device.energy_consumed)
114        };
115
116        // Calculate efficiency ratio (now self is no longer mutably borrowed)
117        let power_profile = self.get_power_profile(&device_type);
118        let efficiency_ratio =
119            self.calculate_efficiency_ratio(power_profile, power_watts, utilization);
120
121        let measurement = EnergyMeasurement {
122            timestamp: now,
123            device_id: device_id.to_string(),
124            power_watts,
125            energy_kwh: updated_energy_kwh,
126            utilization,
127            temperature,
128            efficiency_ratio,
129        };
130
131        self.consumption_history.push(measurement.clone());
132        self.update_efficiency_metrics();
133
134        Ok(measurement)
135    }
136
137    /// Get power profile for device type
138    fn get_power_profile(&self, device_type: &DeviceType) -> &PowerProfile {
139        self.power_profiles
140            .get(&self.device_type_key(device_type))
141            .unwrap_or(&self.power_profiles["default"])
142    }
143
144    /// Convert device type to key for power profiles
145    fn device_type_key(&self, device_type: &DeviceType) -> String {
146        match device_type {
147            DeviceType::GPU => "gpu".to_string(),
148            DeviceType::CPU => "cpu".to_string(),
149            DeviceType::Memory => "memory".to_string(),
150            DeviceType::Storage => "storage".to_string(),
151            DeviceType::Network => "network".to_string(),
152            DeviceType::Cooling => "cooling".to_string(),
153            DeviceType::Other(name) => name.clone(),
154        }
155    }
156
157    /// Calculate efficiency ratio based on power and utilization
158    fn calculate_efficiency_ratio(
159        &self,
160        profile: &PowerProfile,
161        power: f64,
162        utilization: f64,
163    ) -> f64 {
164        let expected_power =
165            profile.idle_power + (profile.max_power - profile.idle_power) * utilization;
166
167        if expected_power > 0.0 {
168            expected_power / power.max(1.0) // Avoid division by zero
169        } else {
170            1.0
171        }
172    }
173
174    /// Update overall efficiency metrics
175    fn update_efficiency_metrics(&mut self) {
176        if self.consumption_history.is_empty() {
177            return;
178        }
179
180        let recent_measurements: Vec<_> = self.consumption_history
181            .iter()
182            .rev()
183            .take(100) // Last 100 measurements
184            .collect();
185
186        // Calculate operations per kWh (simplified)
187        let total_energy: f64 = recent_measurements.iter().map(|m| m.energy_kwh).sum();
188        let total_operations = recent_measurements.len() as f64 * 1000.0; // Simplified
189
190        if total_energy > 0.0 {
191            self.efficiency_metrics.operations_per_kwh = total_operations / total_energy;
192        }
193
194        // Calculate FLOPS per watt (simplified)
195        let avg_power: f64 = recent_measurements.iter().map(|m| m.power_watts).sum::<f64>()
196            / recent_measurements.len() as f64;
197        let avg_utilization: f64 = recent_measurements.iter().map(|m| m.utilization).sum::<f64>()
198            / recent_measurements.len() as f64;
199
200        if avg_power > 0.0 {
201            // Simplified FLOPS calculation
202            let estimated_flops = avg_utilization * 1e12; // 1 TFLOP at full utilization
203            self.efficiency_metrics.flops_per_watt = estimated_flops / avg_power;
204        }
205
206        // Update comparative efficiency (simplified)
207        self.update_comparative_efficiency();
208    }
209
210    /// Update comparative efficiency metrics
211    fn update_comparative_efficiency(&mut self) {
212        // Simplified comparative analysis
213        let current_efficiency = self.efficiency_metrics.flops_per_watt;
214
215        // vs CPU only (GPUs are typically 10-50x more efficient for ML workloads)
216        self.efficiency_metrics.comparative_efficiency.vs_cpu_only = current_efficiency / 1e9;
217
218        // vs previous generation (assume 20% improvement per generation)
219        self.efficiency_metrics.comparative_efficiency.vs_previous_generation = 1.2;
220
221        // vs cloud baseline (simplified)
222        self.efficiency_metrics.comparative_efficiency.vs_cloud_baseline = 1.1;
223
224        // Efficiency percentile (simplified ranking)
225        self.efficiency_metrics.comparative_efficiency.efficiency_percentile =
226            (current_efficiency / 1e11).min(100.0);
227    }
228
229    /// Get current energy consumption for all devices
230    pub fn get_current_consumption(&self) -> f64 {
231        self.device_monitors.values().map(|d| d.current_power).sum::<f64>() / 1000.0
232        // Convert to kW
233    }
234
235    /// Get total energy consumed
236    pub fn get_total_energy_consumed(&self) -> f64 {
237        self.device_monitors.values().map(|d| d.energy_consumed).sum()
238    }
239
240    /// Get efficiency metrics
241    pub fn get_efficiency_metrics(&self) -> &EnergyEfficiencyMetrics {
242        &self.efficiency_metrics
243    }
244
245    /// Get consumption history
246    pub fn get_consumption_history(&self) -> &[EnergyMeasurement] {
247        &self.consumption_history
248    }
249
250    /// Get measurements for a specific device
251    pub fn get_device_measurements(&self, device_id: &str) -> Vec<&EnergyMeasurement> {
252        self.consumption_history.iter().filter(|m| m.device_id == device_id).collect()
253    }
254
255    /// Detect energy waste patterns
256    pub fn detect_energy_waste(&self) -> Vec<WasteMeasurement> {
257        let mut waste_measurements = Vec::new();
258
259        for device in self.device_monitors.values() {
260            // Check for idle waste
261            let power_profile = self.get_power_profile(&device.device_type);
262            let idle_threshold = power_profile.idle_power * 1.5; // 50% above idle
263
264            if device.current_power < idle_threshold && device.current_power > 0.0 {
265                let wasted_power = device.current_power - power_profile.idle_power;
266                let waste_measurement = WasteMeasurement {
267                    timestamp: std::time::SystemTime::now(),
268                    waste_type: WasteType::IdleResources,
269                    wasted_energy_kwh: wasted_power / 1000.0, // Per hour
270                    wasted_cost_usd: (wasted_power / 1000.0) * 0.12, // Assuming $0.12/kWh
271                    efficiency_lost_percentage: (wasted_power / device.current_power) * 100.0,
272                    description: format!("Device {} running above idle power", device.device_id),
273                };
274                waste_measurements.push(waste_measurement);
275            }
276        }
277
278        waste_measurements
279    }
280
281    /// Predict energy consumption for next N hours
282    pub fn predict_energy_consumption(&self, hours: u32) -> f64 {
283        if self.consumption_history.len() < 10 {
284            return self.get_current_consumption() * hours as f64;
285        }
286
287        // Simple trend analysis on recent consumption
288        let recent_power: Vec<f64> = self.consumption_history
289            .iter()
290            .rev()
291            .take(24) // Last 24 measurements
292            .map(|m| m.power_watts)
293            .collect();
294
295        let avg_power = recent_power.iter().sum::<f64>() / recent_power.len() as f64;
296        (avg_power / 1000.0) * hours as f64 // Convert to kWh
297    }
298
299    /// Create default power profiles for different device types
300    fn create_default_power_profiles() -> HashMap<String, PowerProfile> {
301        let mut profiles = HashMap::new();
302
303        // GPU profile
304        profiles.insert(
305            "gpu".to_string(),
306            PowerProfile {
307                device_type: DeviceType::GPU,
308                idle_power: 50.0, // 50W idle
309                max_power: 350.0, // 350W max
310                utilization_curve: vec![
311                    (0.0, 0.15), // 15% of max at 0% utilization
312                    (0.2, 0.3),  // 30% of max at 20% utilization
313                    (0.5, 0.6),  // 60% of max at 50% utilization
314                    (0.8, 0.85), // 85% of max at 80% utilization
315                    (1.0, 1.0),  // 100% of max at 100% utilization
316                ],
317            },
318        );
319
320        // CPU profile
321        profiles.insert(
322            "cpu".to_string(),
323            PowerProfile {
324                device_type: DeviceType::CPU,
325                idle_power: 15.0, // 15W idle
326                max_power: 125.0, // 125W max
327                utilization_curve: vec![
328                    (0.0, 0.12),
329                    (0.2, 0.25),
330                    (0.5, 0.55),
331                    (0.8, 0.8),
332                    (1.0, 1.0),
333                ],
334            },
335        );
336
337        // Memory profile
338        profiles.insert(
339            "memory".to_string(),
340            PowerProfile {
341                device_type: DeviceType::Memory,
342                idle_power: 5.0, // 5W idle
343                max_power: 20.0, // 20W max
344                utilization_curve: vec![(0.0, 0.25), (0.5, 0.6), (1.0, 1.0)],
345            },
346        );
347
348        // Default profile
349        profiles.insert(
350            "default".to_string(),
351            PowerProfile {
352                device_type: DeviceType::Other("default".to_string()),
353                idle_power: 10.0,
354                max_power: 50.0,
355                utilization_curve: vec![(0.0, 0.2), (1.0, 1.0)],
356            },
357        );
358
359        profiles
360    }
361
362    /// Export energy data to CSV
363    pub fn export_to_csv(&self) -> String {
364        let mut csv = String::from(
365            "timestamp,device_id,power_watts,energy_kwh,utilization,temperature,efficiency_ratio\n",
366        );
367
368        for measurement in &self.consumption_history {
369            let timestamp = measurement
370                .timestamp
371                .duration_since(std::time::UNIX_EPOCH)
372                .unwrap_or_default()
373                .as_secs();
374
375            csv.push_str(&format!(
376                "{},{},{:.2},{:.6},{:.4},{},{:.4}\n",
377                timestamp,
378                measurement.device_id,
379                measurement.power_watts,
380                measurement.energy_kwh,
381                measurement.utilization,
382                measurement.temperature.map_or("".to_string(), |t| format!("{:.1}", t)),
383                measurement.efficiency_ratio
384            ));
385        }
386
387        csv
388    }
389
390    /// Clear measurement history
391    pub fn clear_history(&mut self) {
392        self.consumption_history.clear();
393        debug!("Cleared energy measurement history");
394    }
395
396    /// Reset device energy counters
397    pub fn reset_device_counters(&mut self) {
398        for device in self.device_monitors.values_mut() {
399            device.energy_consumed = 0.0;
400            device.last_update = std::time::SystemTime::now();
401        }
402        debug!("Reset all device energy counters");
403    }
404}
405
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a monitor with a single NVML-measured GPU registered as `gpu-0`.
    ///
    /// Fails loudly if registration fails, so a broken setup is reported at
    /// its source instead of as an unrelated assertion failure later on.
    fn monitor_with_gpu() -> EnergyConsumptionMonitor {
        let mut monitor = EnergyConsumptionMonitor::new();
        monitor
            .add_device(
                "gpu-0".to_string(),
                DeviceType::GPU,
                PowerMeasurementMethod::NVML,
            )
            .expect("registering gpu-0 should succeed");
        monitor
    }

    #[test]
    fn test_energy_monitor_creation() {
        let monitor = EnergyConsumptionMonitor::new();
        assert!(monitor.device_monitors.is_empty());
        assert!(monitor.consumption_history.is_empty());
        assert_eq!(monitor.get_total_energy_consumed(), 0.0);
    }

    #[test]
    fn test_add_device() {
        let mut monitor = EnergyConsumptionMonitor::new();

        let result = monitor.add_device(
            "gpu-0".to_string(),
            DeviceType::GPU,
            PowerMeasurementMethod::NVML,
        );

        assert!(result.is_ok());
        assert_eq!(monitor.device_monitors.len(), 1);
        assert!(monitor.device_monitors.contains_key("gpu-0"));
    }

    #[test]
    fn test_record_measurement() {
        let mut monitor = monitor_with_gpu();

        let measurement = monitor
            .record_measurement(
                "gpu-0",
                200.0,      // 200W power
                0.8,        // 80% utilization
                Some(65.0), // 65°C temperature
            )
            .expect("recording for a registered device should succeed");

        assert_eq!(measurement.device_id, "gpu-0");
        assert_eq!(measurement.power_watts, 200.0);
        assert_eq!(measurement.utilization, 0.8);
        assert_eq!(measurement.temperature, Some(65.0));
        assert_eq!(monitor.consumption_history.len(), 1);
    }

    #[test]
    fn test_record_measurement_unknown_device() {
        let mut monitor = EnergyConsumptionMonitor::new();

        let result = monitor.record_measurement("missing", 100.0, 0.5, None);

        assert!(result.is_err());
        assert!(monitor.consumption_history.is_empty());
    }

    #[test]
    fn test_energy_consumption_tracking() {
        let mut monitor = monitor_with_gpu();

        // Real time must pass between updates for any energy to accumulate.
        std::thread::sleep(std::time::Duration::from_millis(100));
        monitor
            .record_measurement("gpu-0", 200.0, 0.8, Some(65.0))
            .expect("first measurement should be recorded");

        std::thread::sleep(std::time::Duration::from_millis(100));
        monitor
            .record_measurement("gpu-0", 250.0, 0.9, Some(70.0))
            .expect("second measurement should be recorded");

        assert!(monitor.get_total_energy_consumed() > 0.0);
        assert_eq!(monitor.get_consumption_history().len(), 2);
    }

    #[test]
    fn test_waste_detection() {
        let mut monitor = monitor_with_gpu();

        // 70W sits between the GPU idle power (50W) and the 1.5x idle threshold (75W).
        monitor
            .record_measurement("gpu-0", 70.0, 0.1, Some(40.0))
            .expect("measurement should be recorded");

        let waste = monitor.detect_energy_waste();
        assert!(!waste.is_empty());
        assert!(matches!(waste[0].waste_type, WasteType::IdleResources));
    }

    #[test]
    fn test_power_profiles() {
        let monitor = EnergyConsumptionMonitor::new();

        let gpu_profile = monitor.get_power_profile(&DeviceType::GPU);
        assert_eq!(gpu_profile.idle_power, 50.0);
        assert_eq!(gpu_profile.max_power, 350.0);

        let cpu_profile = monitor.get_power_profile(&DeviceType::CPU);
        assert_eq!(cpu_profile.idle_power, 15.0);
        assert_eq!(cpu_profile.max_power, 125.0);
    }

    #[test]
    fn test_unknown_device_type_uses_default_profile() {
        let monitor = EnergyConsumptionMonitor::new();

        let profile = monitor.get_power_profile(&DeviceType::Other("tpu".to_string()));

        assert_eq!(profile.idle_power, 10.0);
        assert_eq!(profile.max_power, 50.0);
    }

    #[test]
    fn test_efficiency_calculation() {
        let monitor = EnergyConsumptionMonitor::new();
        let profile = monitor.get_power_profile(&DeviceType::GPU);

        let efficiency = monitor.calculate_efficiency_ratio(profile, 200.0, 0.5);
        assert!(efficiency > 0.0);
        assert!(efficiency <= 2.0); // Should be reasonable
    }
}