Skip to main content

tenflowers_core/memory/
pool_diagnostics.rs

1//! Memory Pool Diagnostics Integration
2//!
3//! This module provides integration between memory pools and the GPU memory diagnostics
4//! system, enabling comprehensive monitoring, health analysis, and automatic optimization
5//! of memory pool behavior.
6
7use super::pools::{MemoryPool, MemoryPoolStats, MemoryPressureLevel};
8use std::sync::Arc;
9use std::time::{Duration, Instant};
10
11#[cfg(feature = "gpu")]
12use crate::gpu::memory_diagnostics::{
13    DiagnosticReport, FragmentationAnalysis, GpuMemoryDiagnostics, LeakDetectionResult,
14};
15
/// Overall health classification of a memory pool.
///
/// Variants are declared from least to most severe. [`PoolHealthStatus::Healthy`]
/// is the value produced by [`Default`].
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub enum PoolHealthStatus {
    /// The pool is operating normally.
    #[default]
    Healthy,
    /// Minor issues were detected (mild fragmentation or pressure).
    Warning,
    /// Significant issues that require attention.
    Degraded,
    /// Critical state that requires immediate intervention.
    Critical,
}
29
30/// Memory pool health metrics
31#[derive(Debug, Clone)]
32pub struct PoolHealthMetrics {
33    pub status: PoolHealthStatus,
34    pub fragmentation_score: f32,     // 0.0-1.0, where 1.0 is worst
35    pub pressure_score: f32,          // 0.0-1.0, where 1.0 is critical
36    pub efficiency_score: f32,        // 0.0-1.0, where 1.0 is best
37    pub allocation_success_rate: f32, // 0.0-1.0
38    pub average_allocation_time_us: f32,
39    pub defragmentation_needed: bool,
40    pub recommendations: Vec<String>,
41}
42
43impl PoolHealthMetrics {
44    /// Create health metrics from pool statistics
45    pub fn from_stats(stats: &MemoryPoolStats) -> Self {
46        let fragmentation_score = stats.fragmentation_ratio;
47        let pressure_score = stats.memory_pressure;
48
49        // Calculate efficiency: how well is memory being utilized?
50        let efficiency_score = if stats.total_allocated + stats.total_free > 0 {
51            1.0 - fragmentation_score
52        } else {
53            1.0
54        };
55
56        // Estimate allocation success rate (simplified)
57        let allocation_success_rate = if stats.allocation_count > 0 {
58            1.0 - (fragmentation_score * 0.5) // High fragmentation reduces success
59        } else {
60            1.0
61        };
62
63        // Determine health status
64        let status = if pressure_score > 0.95 || fragmentation_score > 0.7 {
65            PoolHealthStatus::Critical
66        } else if pressure_score > 0.8 || fragmentation_score > 0.5 {
67            PoolHealthStatus::Degraded
68        } else if pressure_score > 0.6 || fragmentation_score > 0.3 {
69            PoolHealthStatus::Warning
70        } else {
71            PoolHealthStatus::Healthy
72        };
73
74        // Check if defragmentation is needed
75        let defragmentation_needed = fragmentation_score > 0.25
76            || (stats.blocks_free > 10 && stats.largest_free_block < stats.total_free / 2);
77
78        // Generate recommendations
79        let mut recommendations = Vec::new();
80
81        if pressure_score > 0.8 {
82            recommendations.push(
83                "High memory pressure: Consider increasing pool size or reducing allocations"
84                    .to_string(),
85            );
86        }
87
88        if fragmentation_score > 0.5 {
89            recommendations
90                .push("Severe fragmentation detected: Run defragmentation immediately".to_string());
91        } else if fragmentation_score > 0.3 {
92            recommendations
93                .push("Moderate fragmentation: Schedule defragmentation soon".to_string());
94        }
95
96        if stats.blocks_free > 20 {
97            recommendations.push(format!(
98                "High block count ({} free blocks): Fragmentation likely, defragmentation recommended",
99                stats.blocks_free
100            ));
101        }
102
103        if efficiency_score < 0.5 {
104            recommendations.push("Low memory efficiency: Review allocation patterns".to_string());
105        }
106
107        Self {
108            status,
109            fragmentation_score,
110            pressure_score,
111            efficiency_score,
112            allocation_success_rate,
113            average_allocation_time_us: 0.0, // Would need timing data
114            defragmentation_needed,
115            recommendations,
116        }
117    }
118
119    /// Print health metrics in a user-friendly format
120    pub fn print(&self) {
121        println!("\n╔══════════════════════════════════════════════════════╗");
122        println!("║   Memory Pool Health Report                         ║");
123        println!("╚══════════════════════════════════════════════════════╝");
124
125        let status_icon = match self.status {
126            PoolHealthStatus::Healthy => "✅",
127            PoolHealthStatus::Warning => "⚠️ ",
128            PoolHealthStatus::Degraded => "🔶",
129            PoolHealthStatus::Critical => "🔴",
130        };
131
132        println!("\n{} Status: {:?}", status_icon, self.status);
133        println!("\nMetrics:");
134        println!(
135            "  • Fragmentation:     {:.1}% {}",
136            self.fragmentation_score * 100.0,
137            if self.fragmentation_score > 0.5 {
138                "⚠️"
139            } else {
140                ""
141            }
142        );
143        println!(
144            "  • Memory Pressure:   {:.1}% {}",
145            self.pressure_score * 100.0,
146            if self.pressure_score > 0.8 {
147                "⚠️"
148            } else {
149                ""
150            }
151        );
152        println!(
153            "  • Efficiency:        {:.1}%",
154            self.efficiency_score * 100.0
155        );
156        println!(
157            "  • Success Rate:      {:.1}%",
158            self.allocation_success_rate * 100.0
159        );
160
161        if self.defragmentation_needed {
162            println!("\n⚠️  Defragmentation recommended");
163        }
164
165        if !self.recommendations.is_empty() {
166            println!("\nRecommendations:");
167            for (i, rec) in self.recommendations.iter().enumerate() {
168                println!("  {}. {}", i + 1, rec);
169            }
170        }
171
172        println!();
173    }
174}
175
/// Settings that control automatic memory pool optimization.
#[derive(Clone, Debug)]
pub struct PoolOptimizationConfig {
    /// Whether automatic defragmentation is enabled.
    pub auto_defrag_enabled: bool,

    /// Fragmentation level (0.0-1.0) that triggers defragmentation.
    pub auto_defrag_threshold: f32,

    /// Minimum time that must pass between two defragmentation runs.
    pub defrag_min_interval: Duration,

    /// Whether automatic health monitoring is enabled.
    pub health_monitoring_enabled: bool,

    /// How often health checks are performed.
    pub health_check_interval: Duration,

    /// Whether diagnostic integration is enabled.
    pub diagnostics_integration: bool,

    /// Memory pressure above which aggressive cleanup is triggered.
    pub max_pressure_threshold: f32,
}
200
201impl Default for PoolOptimizationConfig {
202    fn default() -> Self {
203        Self {
204            auto_defrag_enabled: true,
205            auto_defrag_threshold: 0.25,
206            defrag_min_interval: Duration::from_secs(30),
207            health_monitoring_enabled: true,
208            health_check_interval: Duration::from_secs(10),
209            diagnostics_integration: true,
210            max_pressure_threshold: 0.90,
211        }
212    }
213}
214
/// Enhanced memory pool with diagnostic integration.
///
/// Wraps a shared [`MemoryPool`] and layers health checks, a bounded history
/// of health snapshots, automatic optimization and integrated diagnostics on
/// top of it, all driven by a [`PoolOptimizationConfig`].
#[cfg(feature = "gpu")]
pub struct DiagnosticMemoryPool {
    /// The wrapped pool.
    pool: Arc<MemoryPool>,
    /// Settings consulted by `auto_optimize` and `run_integrated_diagnostics`.
    config: PoolOptimizationConfig,
    /// Time of the most recent recorded health check (initially creation time).
    last_health_check: Arc<std::sync::Mutex<Instant>>,
    /// Time of the most recent `run_integrated_diagnostics` call (initially
    /// creation time).
    last_diagnostic_run: Arc<std::sync::Mutex<Instant>>,
    /// Time of the most recent automatic defragmentation, or `None` when
    /// `auto_optimize` has not defragmented yet.
    last_defrag: Arc<std::sync::Mutex<Option<Instant>>>,
    /// Most recent health snapshots, oldest first.
    health_history: Arc<std::sync::Mutex<Vec<PoolHealthMetrics>>>,
}

#[cfg(feature = "gpu")]
impl DiagnosticMemoryPool {
    /// Maximum number of health snapshots kept in the history.
    const MAX_HEALTH_HISTORY: usize = 100;

    /// Create a new diagnostic memory pool with the default configuration.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `MemoryPool::new` if the underlying pool
    /// cannot be created.
    pub fn new(device_id: usize, pool_size: usize) -> crate::Result<Self> {
        Self::with_config(device_id, pool_size, PoolOptimizationConfig::default())
    }

    /// Create a new diagnostic memory pool with a custom configuration.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `MemoryPool::new` if the underlying pool
    /// cannot be created.
    pub fn with_config(
        device_id: usize,
        pool_size: usize,
        config: PoolOptimizationConfig,
    ) -> crate::Result<Self> {
        let pool = Arc::new(MemoryPool::new(device_id, pool_size)?);
        let created_at = Instant::now();

        Ok(Self {
            pool,
            config,
            last_health_check: Arc::new(std::sync::Mutex::new(created_at)),
            last_diagnostic_run: Arc::new(std::sync::Mutex::new(created_at)),
            last_defrag: Arc::new(std::sync::Mutex::new(None)),
            health_history: Arc::new(std::sync::Mutex::new(Vec::new())),
        })
    }

    /// Lock `mutex`, recovering the guard if a previous holder panicked.
    ///
    /// The guarded values here are plain timestamps and a list of snapshots,
    /// which stay usable after a panic, so poisoning is not treated as fatal.
    fn lock_recovering<T>(mutex: &std::sync::Mutex<T>) -> std::sync::MutexGuard<'_, T> {
        mutex
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Append `metrics` to the bounded history and stamp the health-check time.
    fn record_health(&self, metrics: &PoolHealthMetrics) {
        {
            let mut history = Self::lock_recovering(&self.health_history);
            history.push(metrics.clone());
            // One snapshot is added per call, so dropping the oldest entry is
            // enough to stay within the cap.
            if history.len() > Self::MAX_HEALTH_HISTORY {
                history.remove(0);
            }
        }

        *Self::lock_recovering(&self.last_health_check) = Instant::now();
    }

    /// Get the underlying memory pool.
    pub fn pool(&self) -> &Arc<MemoryPool> {
        &self.pool
    }

    /// Check pool health and return the metrics.
    ///
    /// The snapshot is also appended to the health history (capped at the 100
    /// most recent entries) and the last-health-check time is updated.
    pub fn check_health(&self) -> PoolHealthMetrics {
        let stats = self.pool.stats();
        let metrics = PoolHealthMetrics::from_stats(&stats);
        self.record_health(&metrics);
        metrics
    }

    /// Run automatic optimization based on the current health of the pool.
    ///
    /// A health snapshot is taken and recorded exactly as `check_health` does.
    /// Then:
    ///
    /// * If `auto_defrag_enabled` is set, the pool is defragmented when its
    ///   fragmentation exceeds `auto_defrag_threshold` or its free space is
    ///   scattered over many small blocks. At least `defrag_min_interval`
    ///   must have passed since the previous automatic defragmentation; the
    ///   first one is never delayed.
    /// * If memory pressure exceeds `max_pressure_threshold`, an aggressive
    ///   cleanup is attempted.
    ///
    /// The returned [`OptimizationResult`] lists the actions that were taken.
    pub fn auto_optimize(&self) -> OptimizationResult {
        let stats = self.pool.stats();
        let metrics = PoolHealthMetrics::from_stats(&stats);
        self.record_health(&metrics);

        let mut result = OptimizationResult::default();

        // Honour the configured threshold rather than the fixed 0.25 baked
        // into `metrics.defragmentation_needed`. The scattered-free-space test
        // mirrors the one in `PoolHealthMetrics::from_stats`, so with the
        // default threshold of 0.25 the decision is identical to that flag.
        let fragmentation_over_threshold =
            metrics.fragmentation_score > self.config.auto_defrag_threshold;
        let free_space_scattered =
            stats.blocks_free > 10 && stats.largest_free_block < stats.total_free / 2;

        if self.config.auto_defrag_enabled
            && (fragmentation_over_threshold || free_space_scattered)
        {
            // The cooldown has its own timestamp: `last_health_check` was
            // refreshed a few lines above, so measuring against it would
            // always report ~0 elapsed and defragmentation would never run.
            // The guard is held across the defragmentation so concurrent
            // callers cannot both pass the cooldown check.
            let mut last_defrag = Self::lock_recovering(&self.last_defrag);
            let cooldown_over = match *last_defrag {
                Some(previous) => previous.elapsed() >= self.config.defrag_min_interval,
                None => true,
            };

            if cooldown_over {
                self.pool.defragment();
                *last_defrag = Some(Instant::now());
                result.defragmentation_performed = true;
                result.actions.push("Performed defragmentation".to_string());
            }
        }

        // Check memory pressure. Cleanup is best-effort: if it reports an
        // error, the result simply records no freed bytes.
        // NOTE(review): the meaning of the `1024` argument is defined by
        // `MemoryPool::aggressive_cleanup` — confirm it is the intended value.
        if metrics.pressure_score > self.config.max_pressure_threshold {
            if let Ok(freed) = self.pool.aggressive_cleanup(1024) {
                result.bytes_freed = freed;
                result
                    .actions
                    .push(format!("Aggressive cleanup freed {} bytes", freed));
            }
        }

        result.health_status = metrics.status;
        result
    }

    /// Produce a report combining pool statistics, pool health and, when
    /// `diagnostics_integration` is enabled, the global GPU diagnostics.
    ///
    /// Unlike `check_health`, this does not add to the health history. It
    /// does update the last-diagnostic-run time.
    pub fn run_integrated_diagnostics(&self) -> IntegratedDiagnosticReport {
        let pool_stats = self.pool.stats();
        let pool_health = PoolHealthMetrics::from_stats(&pool_stats);

        // This impl only exists with the `gpu` feature, so the global GPU
        // diagnostics are always available here.
        let gpu_diagnostics = if self.config.diagnostics_integration {
            Some(crate::gpu::memory_diagnostics::GLOBAL_GPU_DIAGNOSTICS.run_diagnostics())
        } else {
            None
        };

        let timestamp = Instant::now();
        *Self::lock_recovering(&self.last_diagnostic_run) = timestamp;

        IntegratedDiagnosticReport {
            pool_stats,
            pool_health,
            gpu_diagnostics,
            timestamp,
        }
    }

    /// Get the configuration.
    pub fn config(&self) -> &PoolOptimizationConfig {
        &self.config
    }

    /// Replace the configuration.
    pub fn set_config(&mut self, config: PoolOptimizationConfig) {
        self.config = config;
    }

    /// Get a copy of the health history, oldest snapshot first.
    pub fn health_history(&self) -> Vec<PoolHealthMetrics> {
        Self::lock_recovering(&self.health_history).clone()
    }
}
359
360/// Result of optimization operations
361#[derive(Debug, Clone, Default)]
362pub struct OptimizationResult {
363    pub health_status: PoolHealthStatus,
364    pub defragmentation_performed: bool,
365    pub bytes_freed: usize,
366    pub actions: Vec<String>,
367}
368
369/// Integrated diagnostic report combining pool and GPU diagnostics
370#[derive(Debug, Clone)]
371pub struct IntegratedDiagnosticReport {
372    pub pool_stats: MemoryPoolStats,
373    pub pool_health: PoolHealthMetrics,
374    #[cfg(feature = "gpu")]
375    pub gpu_diagnostics: Option<DiagnosticReport>,
376    #[cfg(not(feature = "gpu"))]
377    pub gpu_diagnostics: Option<()>,
378    pub timestamp: Instant,
379}
380
381impl IntegratedDiagnosticReport {
382    /// Print comprehensive diagnostic report
383    pub fn print(&self) {
384        println!("\n╔══════════════════════════════════════════════════════╗");
385        println!("║   Integrated Memory Diagnostic Report               ║");
386        println!("╚══════════════════════════════════════════════════════╝");
387
388        println!("\n📊 Memory Pool Statistics:");
389        println!(
390            "  • Total Allocated:    {} bytes",
391            self.pool_stats.total_allocated
392        );
393        println!(
394            "  • Total Free:         {} bytes",
395            self.pool_stats.total_free
396        );
397        println!(
398            "  • Blocks Allocated:   {}",
399            self.pool_stats.blocks_allocated
400        );
401        println!("  • Blocks Free:        {}", self.pool_stats.blocks_free);
402        println!(
403            "  • Peak Allocated:     {} bytes",
404            self.pool_stats.peak_allocated
405        );
406        println!(
407            "  • Allocations:        {}",
408            self.pool_stats.allocation_count
409        );
410        println!(
411            "  • Deallocations:      {}",
412            self.pool_stats.deallocation_count
413        );
414        println!(
415            "  • Defragmentations:   {}",
416            self.pool_stats.defragmentation_count
417        );
418
419        self.pool_health.print();
420
421        #[cfg(feature = "gpu")]
422        if let Some(ref gpu_diag) = self.gpu_diagnostics {
423            println!("\n🖥️  GPU Memory Diagnostics:");
424            gpu_diag.print();
425        }
426    }
427}
428
#[cfg(test)]
mod tests {
    use super::*;

    /// A pool just past the 0.3 fragmentation and 0.6 pressure limits is
    /// rated `Warning`, and its scores are carried over from the statistics.
    #[test]
    fn test_pool_health_metrics_creation() {
        let snapshot = MemoryPoolStats {
            total_allocated: 1024 * 1024, // 1 MB
            total_free: 1024 * 1024,      // 1 MB
            blocks_allocated: 10,
            blocks_free: 5,
            fragmentation_ratio: 0.35, // Above 0.3 threshold for Warning
            peak_allocated: 1024 * 1024,
            allocation_count: 100,
            deallocation_count: 90,
            defragmentation_count: 2,
            largest_free_block: 512 * 1024,
            average_block_size: 100.0 * 1024.0,
            memory_pressure: 0.65, // Above 0.6 threshold for Warning
        };

        let PoolHealthMetrics {
            status,
            fragmentation_score,
            pressure_score,
            efficiency_score,
            ..
        } = PoolHealthMetrics::from_stats(&snapshot);

        assert_eq!(status, PoolHealthStatus::Warning);
        assert!((fragmentation_score - 0.35).abs() < 0.01);
        assert!((pressure_score - 0.65).abs() < 0.01);
        assert!(efficiency_score > 0.6);
    }

    /// The two ends of the scale: a lightly used pool is `Healthy`, a nearly
    /// full and heavily fragmented one is `Critical`.
    #[test]
    fn test_health_status_determination() {
        // Lightly used pool with one large free block.
        let lightly_used = MemoryPoolStats {
            total_allocated: 100,
            total_free: 900,
            blocks_allocated: 1,
            blocks_free: 1,
            fragmentation_ratio: 0.1,
            peak_allocated: 150,
            allocation_count: 10,
            deallocation_count: 9,
            defragmentation_count: 0,
            largest_free_block: 900,
            average_block_size: 100.0,
            memory_pressure: 0.1,
        };
        let healthy = PoolHealthMetrics::from_stats(&lightly_used);
        assert_eq!(healthy.status, PoolHealthStatus::Healthy);

        // Nearly exhausted pool whose free space is split into many blocks.
        let exhausted = MemoryPoolStats {
            total_allocated: 960,
            total_free: 40,
            blocks_allocated: 20,
            blocks_free: 50,
            fragmentation_ratio: 0.8,
            peak_allocated: 960,
            allocation_count: 1000,
            deallocation_count: 980,
            defragmentation_count: 10,
            largest_free_block: 10,
            average_block_size: 20.0,
            memory_pressure: 0.96,
        };
        let critical = PoolHealthMetrics::from_stats(&exhausted);
        assert_eq!(critical.status, PoolHealthStatus::Critical);
    }

    /// The default configuration enables every feature and uses a 0.25
    /// defragmentation threshold.
    #[test]
    fn test_optimization_config_default() {
        let defaults = PoolOptimizationConfig::default();

        assert!(defaults.auto_defrag_enabled);
        assert_eq!(defaults.auto_defrag_threshold, 0.25);
        assert!(defaults.health_monitoring_enabled);
        assert!(defaults.diagnostics_integration);
    }
}