// scirs2_transform/optimization_config.rs
//! Optimization configuration and auto-tuning system
//!
//! This module provides intelligent configuration systems that automatically
//! choose optimal settings for transformations based on data characteristics
//! and system resources.

use std::collections::HashMap;

use scirs2_core::random::RngExt;
use scirs2_core::Rng;
#[cfg(feature = "distributed")]
use serde::{Deserialize, Serialize};

use crate::error::{Result, TransformError};
use crate::utils::ProcessingStrategy;
/// System resource information
///
/// Snapshot of hardware capabilities used to choose processing strategies,
/// chunk sizes, and memory limits. Normally populated via
/// [`SystemResources::detect`].
#[derive(Debug, Clone)]
#[cfg_attr(feature = "distributed", derive(Serialize, Deserialize))]
pub struct SystemResources {
    /// Available memory in MB
    pub memory_mb: usize,
    /// Number of CPU cores
    pub cpu_cores: usize,
    /// Whether GPU is available
    pub has_gpu: bool,
    /// Whether SIMD instructions are available
    pub has_simd: bool,
    /// L3 cache size in KB (affects chunk sizes)
    pub l3_cache_kb: usize,
}
31
32impl SystemResources {
33    /// Detect system resources automatically
34    pub fn detect() -> Self {
35        SystemResources {
36            memory_mb: Self::detect_memory_mb(),
37            cpu_cores: num_cpus::get(),
38            has_gpu: Self::detect_gpu(),
39            has_simd: Self::detect_simd(),
40            l3_cache_kb: Self::detect_l3_cache_kb(),
41        }
42    }
43
44    /// Detect available memory
45    fn detect_memory_mb() -> usize {
46        // Simplified detection - in practice, use system APIs
47        #[cfg(target_os = "linux")]
48        {
49            if let Ok(meminfo) = std::fs::read_to_string("/proc/meminfo") {
50                for line in meminfo.lines() {
51                    if line.starts_with("MemAvailable:") {
52                        if let Some(kb_str) = line.split_whitespace().nth(1) {
53                            if let Ok(kb) = kb_str.parse::<usize>() {
54                                return kb / 1024; // Convert to MB
55                            }
56                        }
57                    }
58                }
59            }
60        }
61
62        // Fallback: assume 8GB
63        8 * 1024
64    }
65
66    /// Detect GPU availability
67    fn detect_gpu() -> bool {
68        // Simplified detection
69        #[cfg(feature = "gpu")]
70        {
71            // In practice, check for CUDA or OpenCL
72            true
73        }
74        #[cfg(not(feature = "gpu"))]
75        {
76            false
77        }
78    }
79
80    /// Detect SIMD support
81    fn detect_simd() -> bool {
82        #[cfg(feature = "simd")]
83        {
84            true
85        }
86        #[cfg(not(feature = "simd"))]
87        {
88            false
89        }
90    }
91
92    /// Detect L3 cache size
93    fn detect_l3_cache_kb() -> usize {
94        // Simplified - in practice, use CPUID or /sys/devices/system/cpu
95        8 * 1024 // Assume 8MB L3 cache
96    }
97
98    /// Get conservative memory limit for transformations (80% of available)
99    pub fn safe_memory_mb(&self) -> usize {
100        (self.memory_mb as f64 * 0.8) as usize
101    }
102
103    /// Get optimal chunk size based on cache size
104    pub fn optimal_chunk_size(&self, elementsize: usize) -> usize {
105        // Target 50% of L3 cache
106        let target_bytes = (self.l3_cache_kb * 1024) / 2;
107        (target_bytes / elementsize).max(1000) // At least 1000 elements
108    }
109}
110
/// Data characteristics for optimization decisions
///
/// Summary statistics of an input matrix, as computed by
/// [`DataCharacteristics::analyze`], used to select processing strategies.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "distributed", derive(Serialize, Deserialize))]
pub struct DataCharacteristics {
    /// Number of samples
    pub n_samples: usize,
    /// Number of features
    pub nfeatures: usize,
    /// Data sparsity (0.0 = dense, 1.0 = all zeros)
    pub sparsity: f64,
    /// Data range (max - min) over finite values
    pub data_range: f64,
    /// Outlier ratio (estimated via the 1.5*IQR rule on a sample)
    pub outlier_ratio: f64,
    /// Whether data has missing values (non-finite entries: NaN or inf)
    pub has_missing: bool,
    /// Estimated memory footprint in MB
    pub memory_footprint_mb: f64,
    /// Data type size (e.g., 8 for f64)
    pub elementsize: usize,
}
132
133impl DataCharacteristics {
134    /// Analyze data characteristics from array view
135    pub fn analyze(data: &scirs2_core::ndarray::ArrayView2<f64>) -> Result<Self> {
136        let (n_samples, nfeatures) = data.dim();
137
138        if n_samples == 0 || nfeatures == 0 {
139            return Err(TransformError::InvalidInput("Empty _data".to_string()));
140        }
141
142        // Calculate sparsity
143        let zeros = data.iter().filter(|&&x| x == 0.0).count();
144        let sparsity = zeros as f64 / data.len() as f64;
145
146        // Calculate _data range
147        let mut min_val = f64::INFINITY;
148        let mut max_val = f64::NEG_INFINITY;
149        let mut finite_count = 0;
150        let mut missing_count = 0;
151
152        for &val in data.iter() {
153            if val.is_finite() {
154                min_val = min_val.min(val);
155                max_val = max_val.max(val);
156                finite_count += 1;
157            } else {
158                missing_count += 1;
159            }
160        }
161
162        let data_range = if finite_count > 0 {
163            max_val - min_val
164        } else {
165            0.0
166        };
167        let has_missing = missing_count > 0;
168
169        // Estimate outlier ratio using IQR method (simplified)
170        let outlier_ratio = if n_samples > 10 {
171            let mut sample_values: Vec<f64> = data.iter()
172                .filter(|&&x| x.is_finite())
173                .take(1000) // Sample for efficiency
174                .copied()
175                .collect();
176
177            if sample_values.len() >= 4 {
178                sample_values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
179                let n = sample_values.len();
180                let q1 = sample_values[n / 4];
181                let q3 = sample_values[3 * n / 4];
182                let iqr = q3 - q1;
183
184                if iqr > 0.0 {
185                    let lower_bound = q1 - 1.5 * iqr;
186                    let upper_bound = q3 + 1.5 * iqr;
187                    let outliers = sample_values
188                        .iter()
189                        .filter(|&&x| x < lower_bound || x > upper_bound)
190                        .count();
191                    outliers as f64 / sample_values.len() as f64
192                } else {
193                    0.0
194                }
195            } else {
196                0.0
197            }
198        } else {
199            0.0
200        };
201
202        let memory_footprint_mb =
203            (n_samples * nfeatures * std::mem::size_of::<f64>()) as f64 / (1024.0 * 1024.0);
204
205        Ok(DataCharacteristics {
206            n_samples,
207            nfeatures,
208            sparsity,
209            data_range,
210            outlier_ratio,
211            has_missing,
212            memory_footprint_mb,
213            elementsize: std::mem::size_of::<f64>(),
214        })
215    }
216
217    /// Check if data is considered "large"
218    pub fn is_large_dataset(&self) -> bool {
219        self.n_samples > 100_000 || self.nfeatures > 10_000 || self.memory_footprint_mb > 1000.0
220    }
221
222    /// Check if data is considered "wide" (more features than samples)
223    pub fn is_wide_dataset(&self) -> bool {
224        self.nfeatures > self.n_samples
225    }
226
227    /// Check if data is sparse
228    pub fn is_sparse(&self) -> bool {
229        self.sparsity > 0.5
230    }
231
232    /// Check if data has significant outliers
233    pub fn has_outliers(&self) -> bool {
234        self.outlier_ratio > 0.05 // More than 5% outliers
235    }
236}
237
/// Optimization configuration for a specific transformation
///
/// Produced by the `for_*` constructors or by [`AutoTuner`]; consumers
/// read these fields to decide how to execute a transformation.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "distributed", derive(Serialize, Deserialize))]
pub struct OptimizationConfig {
    /// Processing strategy to use
    pub processing_strategy: ProcessingStrategy,
    /// Memory limit in MB
    pub memory_limit_mb: usize,
    /// Whether to use robust statistics
    pub use_robust: bool,
    /// Whether to use parallel processing
    pub use_parallel: bool,
    /// Whether to use SIMD acceleration
    pub use_simd: bool,
    /// Whether to use GPU acceleration
    pub use_gpu: bool,
    /// Chunk size for batch processing
    pub chunk_size: usize,
    /// Number of threads to use
    pub num_threads: usize,
    /// Additional algorithm-specific parameters (e.g. "degree", "n_components")
    pub algorithm_params: HashMap<String, f64>,
}
261
262impl OptimizationConfig {
263    /// Create optimization config for standardization
264    pub fn for_standardization(datachars: &DataCharacteristics, system: &SystemResources) -> Self {
265        let use_robust = datachars.has_outliers();
266        let use_parallel = datachars.n_samples > 10_000 && system.cpu_cores > 1;
267        let use_simd = system.has_simd && datachars.nfeatures > 100;
268        let use_gpu = system.has_gpu && datachars.memory_footprint_mb > 100.0;
269
270        let processing_strategy = if datachars.memory_footprint_mb > system.safe_memory_mb() as f64
271        {
272            ProcessingStrategy::OutOfCore {
273                chunk_size: system.optimal_chunk_size(datachars.elementsize),
274            }
275        } else if use_parallel {
276            ProcessingStrategy::Parallel
277        } else if use_simd {
278            ProcessingStrategy::Simd
279        } else {
280            ProcessingStrategy::Standard
281        };
282
283        OptimizationConfig {
284            processing_strategy,
285            memory_limit_mb: system.safe_memory_mb(),
286            use_robust,
287            use_parallel,
288            use_simd,
289            use_gpu,
290            chunk_size: system.optimal_chunk_size(datachars.elementsize),
291            num_threads: if use_parallel { system.cpu_cores } else { 1 },
292            algorithm_params: HashMap::new(),
293        }
294    }
295
296    /// Create optimization config for PCA
297    pub fn for_pca(
298        datachars: &DataCharacteristics,
299        system: &SystemResources,
300        n_components: usize,
301    ) -> Self {
302        let use_randomized = datachars.is_large_dataset();
303        let use_parallel = datachars.n_samples > 1_000 && system.cpu_cores > 1;
304        let use_gpu = system.has_gpu && datachars.memory_footprint_mb > 500.0;
305
306        // PCA memory requirements are higher due to covariance matrix
307        let memory_multiplier = if datachars.nfeatures > datachars.n_samples {
308            3.0
309        } else {
310            2.0
311        };
312        let estimated_memory = datachars.memory_footprint_mb * memory_multiplier;
313
314        let processing_strategy = if estimated_memory > system.safe_memory_mb() as f64 {
315            ProcessingStrategy::OutOfCore {
316                chunk_size: (system.safe_memory_mb() * 1024 * 1024)
317                    / (datachars.nfeatures * datachars.elementsize),
318            }
319        } else if use_parallel {
320            ProcessingStrategy::Parallel
321        } else {
322            ProcessingStrategy::Standard
323        };
324
325        let mut algorithm_params = HashMap::new();
326        algorithm_params.insert(
327            "use_randomized".to_string(),
328            if use_randomized { 1.0 } else { 0.0 },
329        );
330        algorithm_params.insert("n_components".to_string(), n_components as f64);
331
332        OptimizationConfig {
333            processing_strategy,
334            memory_limit_mb: system.safe_memory_mb(),
335            use_robust: false, // PCA doesn't typically use robust statistics
336            use_parallel,
337            use_simd: system.has_simd,
338            use_gpu,
339            chunk_size: system.optimal_chunk_size(datachars.elementsize),
340            num_threads: if use_parallel { system.cpu_cores } else { 1 },
341            algorithm_params,
342        }
343    }
344
345    /// Create optimization config for polynomial features
346    pub fn for_polynomial_features(
347        datachars: &DataCharacteristics,
348        system: &SystemResources,
349        degree: usize,
350    ) -> Result<Self> {
351        // Polynomial features can explode in size
352        let estimated_output_features =
353            Self::estimate_polynomial_features(datachars.nfeatures, degree)?;
354        let estimated_memory = datachars.n_samples as f64
355            * estimated_output_features as f64
356            * datachars.elementsize as f64
357            / (1024.0 * 1024.0);
358
359        if estimated_memory > system.memory_mb as f64 * 0.9 {
360            return Err(TransformError::MemoryError(format!(
361                "Polynomial features would require {estimated_memory:.1} MB, but only {} MB available",
362                system.memory_mb
363            )));
364        }
365
366        let use_parallel = datachars.n_samples > 1_000 && system.cpu_cores > 1;
367        let use_simd = system.has_simd && estimated_output_features > 100;
368
369        let processing_strategy = if estimated_memory > system.safe_memory_mb() as f64 {
370            ProcessingStrategy::OutOfCore {
371                chunk_size: (system.safe_memory_mb() * 1024 * 1024)
372                    / (estimated_output_features * datachars.elementsize),
373            }
374        } else if use_parallel {
375            ProcessingStrategy::Parallel
376        } else if use_simd {
377            ProcessingStrategy::Simd
378        } else {
379            ProcessingStrategy::Standard
380        };
381
382        let mut algorithm_params = HashMap::new();
383        algorithm_params.insert("degree".to_string(), degree as f64);
384        algorithm_params.insert(
385            "estimated_output_features".to_string(),
386            estimated_output_features as f64,
387        );
388
389        Ok(OptimizationConfig {
390            processing_strategy,
391            memory_limit_mb: system.safe_memory_mb(),
392            use_robust: false,
393            use_parallel,
394            use_simd,
395            use_gpu: false, // Polynomial features typically don't benefit from GPU
396            chunk_size: system.optimal_chunk_size(datachars.elementsize),
397            num_threads: if use_parallel { system.cpu_cores } else { 1 },
398            algorithm_params,
399        })
400    }
401
402    /// Estimate number of polynomial features
403    fn estimate_polynomial_features(nfeatures: usize, degree: usize) -> Result<usize> {
404        if degree == 0 {
405            return Err(TransformError::InvalidInput(
406                "Degree must be at least 1".to_string(),
407            ));
408        }
409
410        let mut total_features = 1; // bias term
411
412        for d in 1..=degree {
413            // Multinomial coefficient: (nfeatures + d - 1)! / (d! * (nfeatures - 1)!)
414            let mut coeff = 1;
415            for i in 0..d {
416                coeff = coeff * (nfeatures + d - 1 - i) / (i + 1);
417
418                // Check for overflow
419                if coeff > 1_000_000 {
420                    return Err(TransformError::ComputationError(
421                        "Too many polynomial _features would be generated".to_string(),
422                    ));
423                }
424            }
425            total_features += coeff;
426        }
427
428        Ok(total_features)
429    }
430
431    /// Get estimated execution time for this configuration
432    pub fn estimated_execution_time(&self, datachars: &DataCharacteristics) -> std::time::Duration {
433        use std::time::Duration;
434
435        let base_ops = datachars.n_samples as u64 * datachars.nfeatures as u64;
436
437        let ops_per_second = match self.processing_strategy {
438            ProcessingStrategy::Parallel => {
439                1_000_000_000 * self.num_threads as u64 // 1 billion ops/second per thread
440            }
441            ProcessingStrategy::Simd => {
442                2_000_000_000 // 2 billion ops/second with SIMD
443            }
444            ProcessingStrategy::OutOfCore { .. } => {
445                100_000_000 // 100 million ops/second (I/O bound)
446            }
447            ProcessingStrategy::Standard => {
448                500_000_000 // 500 million ops/second
449            }
450        };
451
452        let time_ns = (base_ops * 1_000_000_000) / ops_per_second;
453        Duration::from_nanos(time_ns.max(1000)) // At least 1 microsecond
454    }
455}
456
/// Auto-tuning system for optimization configurations
///
/// Detects system resources once at construction and accumulates
/// per-transformation performance records for future tuning decisions.
pub struct AutoTuner {
    /// System resources detected at construction time
    system: SystemResources,
    /// Performance history for different configurations, keyed by
    /// transformation name (capped at 100 records per transformation)
    performance_history: HashMap<String, Vec<PerformanceRecord>>,
}
464
/// Performance record for auto-tuning
///
/// One observed run of a transformation under a specific configuration.
/// Fields are currently write-only (recorded but not yet consumed by any
/// tuning logic), hence the `dead_code` allowances.
#[derive(Debug, Clone)]
struct PerformanceRecord {
    /// Debug-format rendering of the configuration (simplified hash)
    #[allow(dead_code)]
    config_hash: String,
    /// Wall-clock time of the run
    #[allow(dead_code)]
    execution_time: std::time::Duration,
    /// Memory used during the run, in MB
    #[allow(dead_code)]
    memory_used_mb: f64,
    /// Whether the transformation completed successfully
    #[allow(dead_code)]
    success: bool,
    /// Characteristics of the data the run was performed on
    #[allow(dead_code)]
    data_characteristics: DataCharacteristics,
}
479
480impl Default for AutoTuner {
481    fn default() -> Self {
482        Self::new()
483    }
484}
485
486impl AutoTuner {
487    /// Create a new auto-tuner
488    pub fn new() -> Self {
489        AutoTuner {
490            system: SystemResources::detect(),
491            performance_history: HashMap::new(),
492        }
493    }
494
495    /// Get optimal configuration for a specific transformation
496    pub fn optimize_for_transformation(
497        &self,
498        transformation: &str,
499        datachars: &DataCharacteristics,
500        params: &HashMap<String, f64>,
501    ) -> Result<OptimizationConfig> {
502        match transformation {
503            "standardization" => Ok(OptimizationConfig::for_standardization(
504                datachars,
505                &self.system,
506            )),
507            "pca" => {
508                let n_components = params.get("n_components").unwrap_or(&5.0) as &f64;
509                Ok(OptimizationConfig::for_pca(
510                    datachars,
511                    &self.system,
512                    *n_components as usize,
513                ))
514            }
515            "polynomial" => {
516                let degree = params.get("degree").unwrap_or(&2.0) as &f64;
517                OptimizationConfig::for_polynomial_features(
518                    datachars,
519                    &self.system,
520                    *degree as usize,
521                )
522            }
523            _ => {
524                // Default configuration
525                Ok(OptimizationConfig {
526                    processing_strategy: if datachars.is_large_dataset() {
527                        ProcessingStrategy::Parallel
528                    } else {
529                        ProcessingStrategy::Standard
530                    },
531                    memory_limit_mb: self.system.safe_memory_mb(),
532                    use_robust: datachars.has_outliers(),
533                    use_parallel: datachars.n_samples > 10_000,
534                    use_simd: self.system.has_simd,
535                    use_gpu: self.system.has_gpu && datachars.memory_footprint_mb > 100.0,
536                    chunk_size: self.system.optimal_chunk_size(datachars.elementsize),
537                    num_threads: self.system.cpu_cores,
538                    algorithm_params: HashMap::new(),
539                })
540            }
541        }
542    }
543
544    /// Record performance for learning
545    pub fn record_performance(
546        &mut self,
547        transformation: &str,
548        config: &OptimizationConfig,
549        execution_time: std::time::Duration,
550        memory_used_mb: f64,
551        success: bool,
552        datachars: DataCharacteristics,
553    ) {
554        let config_hash = format!("{config:?}"); // Simplified hash
555
556        let record = PerformanceRecord {
557            config_hash: config_hash.clone(),
558            execution_time,
559            memory_used_mb,
560            success,
561            data_characteristics: datachars,
562        };
563
564        self.performance_history
565            .entry(transformation.to_string())
566            .or_default()
567            .push(record);
568
569        // Keep only recent records (last 100)
570        let records = self
571            .performance_history
572            .get_mut(transformation)
573            .expect("Operation failed");
574        if records.len() > 100 {
575            records.remove(0);
576        }
577    }
578
579    /// Get system resources
580    pub fn system_resources(&self) -> &SystemResources {
581        &self.system
582    }
583
584    /// Generate optimization report
585    pub fn generate_report(&self, datachars: &DataCharacteristics) -> OptimizationReport {
586        let recommendations = vec![
587            self.get_recommendation_for_transformation("standardization", datachars),
588            self.get_recommendation_for_transformation("pca", datachars),
589            self.get_recommendation_for_transformation("polynomial", datachars),
590        ];
591
592        OptimizationReport {
593            system_info: self.system.clone(),
594            data_info: datachars.clone(),
595            recommendations,
596            estimated_total_memory_mb: datachars.memory_footprint_mb * 2.0, // Conservative estimate
597        }
598    }
599
600    fn get_recommendation_for_transformation(
601        &self,
602        transformation: &str,
603        datachars: &DataCharacteristics,
604    ) -> TransformationRecommendation {
605        let config = self
606            .optimize_for_transformation(transformation, datachars, &HashMap::new())
607            .unwrap_or_else(|_| OptimizationConfig {
608                processing_strategy: ProcessingStrategy::Standard,
609                memory_limit_mb: self.system.safe_memory_mb(),
610                use_robust: false,
611                use_parallel: false,
612                use_simd: false,
613                use_gpu: false,
614                chunk_size: 1000,
615                num_threads: 1,
616                algorithm_params: HashMap::new(),
617            });
618
619        let estimated_time = config.estimated_execution_time(datachars);
620
621        TransformationRecommendation {
622            transformation: transformation.to_string(),
623            config,
624            estimated_time,
625            confidence: 0.8, // Placeholder
626            reason: format!(
627                "Optimized for {} samples, {} features",
628                datachars.n_samples, datachars.nfeatures
629            ),
630        }
631    }
632}
633
/// Optimization report
///
/// Aggregated output of [`AutoTuner::generate_report`]; printable via
/// [`OptimizationReport::print_report`].
#[derive(Debug, Clone)]
pub struct OptimizationReport {
    /// System information
    pub system_info: SystemResources,
    /// Data characteristics
    pub data_info: DataCharacteristics,
    /// Recommendations for different transformations
    pub recommendations: Vec<TransformationRecommendation>,
    /// Estimated total memory usage in MB (conservative 2x data footprint)
    pub estimated_total_memory_mb: f64,
}
646
/// Recommendation for a specific transformation
#[derive(Debug, Clone)]
pub struct TransformationRecommendation {
    /// Transformation name (e.g. "standardization", "pca", "polynomial")
    pub transformation: String,
    /// Recommended configuration
    pub config: OptimizationConfig,
    /// Estimated execution time
    pub estimated_time: std::time::Duration,
    /// Confidence in recommendation (0.0 to 1.0; currently a fixed placeholder)
    pub confidence: f64,
    /// Human-readable reason
    pub reason: String,
}
661
662impl OptimizationReport {
663    /// Print a human-readable report
664    pub fn print_report(&self) {
665        println!("=== Optimization Report ===");
666        println!("System Resources:");
667        println!("  Memory: {} MB", self.system_info.memory_mb);
668        println!("  CPU Cores: {}", self.system_info.cpu_cores);
669        println!("  GPU Available: {}", self.system_info.has_gpu);
670        println!("  SIMD Available: {}", self.system_info.has_simd);
671        println!();
672
673        println!("Data Characteristics:");
674        println!("  Samples: {}", self.data_info.n_samples);
675        println!("  Features: {}", self.data_info.nfeatures);
676        println!(
677            "  Memory Footprint: {:.1} MB",
678            self.data_info.memory_footprint_mb
679        );
680        println!("  Sparsity: {:.1}%", self.data_info.sparsity * 100.0);
681        println!("  Has Outliers: {}", self.data_info.has_outliers());
682        println!();
683
684        println!("Recommendations:");
685        for rec in &self.recommendations {
686            println!("  {}:", rec.transformation);
687            println!("    Strategy: {:?}", rec.config.processing_strategy);
688            println!(
689                "    Estimated Time: {:.2}s",
690                rec.estimated_time.as_secs_f64()
691            );
692            println!("    Use Parallel: {}", rec.config.use_parallel);
693            println!("    Use SIMD: {}", rec.config.use_simd);
694            println!("    Use GPU: {}", rec.config.use_gpu);
695            println!("    Reason: {}", rec.reason);
696            println!();
697        }
698    }
699}
700
/// ✅ Advanced MODE: Intelligent Dynamic Configuration Optimizer
/// Provides real-time optimization of transformation parameters based on
/// live performance metrics and adaptive learning from historical patterns.
pub struct AdvancedConfigOptimizer {
    /// Historical performance data for different configurations,
    /// keyed by transformation type
    performance_history: HashMap<String, Vec<PerformanceMetric>>,
    /// Real-time system monitoring
    system_monitor: SystemMonitor,
    /// Machine learning model for configuration prediction
    config_predictor: ConfigurationPredictor,
    /// Adaptive parameter tuning engine
    adaptive_tuner: AdaptiveParameterTuner,
}
714
/// ✅ Advanced MODE: Performance metrics for configuration optimization
///
/// One measurement fed back into the optimizer; combined into a scalar
/// reward by `compute_reward_signal`.
#[derive(Debug, Clone)]
pub struct PerformanceMetric {
    /// Configuration hash for identification
    #[allow(dead_code)]
    config_hash: u64,
    /// Execution time in microseconds
    execution_time_us: u64,
    /// Memory usage in bytes
    memory_usage_bytes: usize,
    /// Cache hit rate (expected 0.0 to 1.0)
    cache_hit_rate: f64,
    /// CPU utilization percentage (treated as a 0.0-1.0 fraction by the
    /// reward function)
    cpu_utilization: f64,
    /// Accuracy/quality score of the transformation
    quality_score: f64,
    /// Timestamp of measurement
    #[allow(dead_code)]
    timestamp: std::time::Instant,
}
735
/// ✅ Advanced MODE: Real-time system performance monitoring
///
/// Snapshot fields refreshed by `update_metrics` (defined elsewhere);
/// read when building state strings and validating configurations.
pub struct SystemMonitor {
    /// Current CPU load average
    cpu_load: f64,
    /// Available memory in bytes
    available_memory_bytes: usize,
    /// Cache miss rate
    cache_miss_rate: f64,
    /// I/O wait percentage
    io_wait_percent: f64,
    /// Temperature information (for thermal throttling)
    cpu_temperature_celsius: f64,
}
749
/// ✅ Advanced MODE: ML-based configuration prediction
pub struct ConfigurationPredictor {
    /// Feature weights for different data characteristics
    #[allow(dead_code)]
    feature_weights: HashMap<String, f64>,
    /// Learning rate for online updates
    #[allow(dead_code)]
    learning_rate: f64,
    /// Prediction confidence threshold
    confidence_threshold: f64,
    /// Training sample count (every 100 samples triggers a retrain;
    /// see `learn_from_performance`)
    sample_count: usize,
}
763
/// ✅ Advanced MODE: Adaptive parameter tuning with reinforcement learning
pub struct AdaptiveParameterTuner {
    /// Q-learning table for parameter optimization
    q_table: HashMap<(String, String), f64>, // (state, action) -> reward
    /// Exploration rate (epsilon)
    exploration_rate: f64,
    /// Learning rate for Q-learning
    learning_rate: f64,
    /// Discount factor for future rewards
    #[allow(dead_code)]
    discount_factor: f64,
    /// Current state representation
    current_state: String,
}
778
779impl Default for AdvancedConfigOptimizer {
780    fn default() -> Self {
781        Self::new()
782    }
783}
784
785impl AdvancedConfigOptimizer {
786    /// ✅ Advanced MODE: Create new advanced-intelligent configuration optimizer
787    pub fn new() -> Self {
788        AdvancedConfigOptimizer {
789            performance_history: HashMap::new(),
790            system_monitor: SystemMonitor::new(),
791            config_predictor: ConfigurationPredictor::new(),
792            adaptive_tuner: AdaptiveParameterTuner::new(),
793        }
794    }
795
796    /// ✅ Advanced MODE: Intelligently optimize configuration in real-time
797    pub fn advanced_optimize_config(
798        &mut self,
799        datachars: &DataCharacteristics,
800        transformation_type: &str,
801        user_params: &HashMap<String, f64>,
802    ) -> Result<OptimizationConfig> {
803        // Update real-time system metrics
804        self.system_monitor.update_metrics()?;
805
806        // Generate state representation for ML models
807        let current_state = self.generate_state_representation(datachars, &self.system_monitor);
808
809        // Use ML predictor to suggest initial configuration
810        let predicted_config = self.config_predictor.predict_optimal_config(
811            &current_state,
812            transformation_type,
813            user_params,
814        )?;
815
816        // Apply adaptive parameter tuning
817        let tuned_config = self.adaptive_tuner.tune_parameters(
818            predicted_config,
819            &current_state,
820            transformation_type,
821        )?;
822
823        // Validate configuration against system constraints
824        let validated_config =
825            self.validate_and_adjust_config(tuned_config, &self.system_monitor)?;
826
827        Ok(validated_config)
828    }
829
830    /// ✅ Advanced MODE: Learn from transformation performance feedback
831    pub fn learn_from_performance(
832        &mut self,
833        config: &OptimizationConfig,
834        performance: PerformanceMetric,
835        transformation_type: &str,
836    ) -> Result<()> {
837        let config_hash = self.compute_config_hash(config);
838
839        // Store performance history
840        self.performance_history
841            .entry(transformation_type.to_string())
842            .or_default()
843            .push(performance.clone());
844
845        // Update ML predictor
846        self.config_predictor.update_from_feedback(&performance)?;
847
848        // Update adaptive tuner with reward signal
849        let reward = self.compute_reward_signal(&performance);
850        self.adaptive_tuner.update_q_values(config_hash, reward)?;
851
852        // Trigger online learning if enough samples accumulated
853        if self.config_predictor.sample_count.is_multiple_of(100) {
854            self.retrain_models()?;
855        }
856
857        Ok(())
858    }
859
860    /// Generate state representation for ML models
861    fn generate_state_representation(
862        &self,
863        datachars: &DataCharacteristics,
864        system_monitor: &SystemMonitor,
865    ) -> String {
866        format!(
867            "samples:{}_features:{}_memory:{:.2}_cpu:{:.2}_sparsity:{:.3}",
868            datachars.n_samples,
869            datachars.nfeatures,
870            datachars.memory_footprint_mb,
871            system_monitor.cpu_load,
872            datachars.sparsity,
873        )
874    }
875
876    /// Compute configuration hash for identification
877    fn compute_config_hash(&self, config: &OptimizationConfig) -> u64 {
878        use std::collections::hash_map::DefaultHasher;
879        use std::hash::{Hash, Hasher};
880
881        let mut hasher = DefaultHasher::new();
882        config.memory_limit_mb.hash(&mut hasher);
883        config.use_parallel.hash(&mut hasher);
884        config.use_simd.hash(&mut hasher);
885        config.use_gpu.hash(&mut hasher);
886        config.chunk_size.hash(&mut hasher);
887        config.num_threads.hash(&mut hasher);
888
889        hasher.finish()
890    }
891
892    /// Compute reward signal from performance metrics
893    fn compute_reward_signal(&self, performance: &PerformanceMetric) -> f64 {
894        // Multi-objective reward function
895        let time_score = 1.0 / (1.0 + performance.execution_time_us as f64 / 1_000_000.0);
896        let memory_score = 1.0 / (1.0 + performance.memory_usage_bytes as f64 / 1_000_000_000.0);
897        let cache_score = performance.cache_hit_rate;
898        let cpu_score = 1.0 - performance.cpu_utilization.min(1.0);
899        let quality_score = performance.quality_score;
900
901        // Weighted combination
902        0.3 * time_score
903            + 0.2 * memory_score
904            + 0.2 * cache_score
905            + 0.1 * cpu_score
906            + 0.2 * quality_score
907    }
908
909    /// Validate and adjust configuration based on current system state
910    fn validate_and_adjust_config(
911        &self,
912        mut config: OptimizationConfig,
913        system_monitor: &SystemMonitor,
914    ) -> Result<OptimizationConfig> {
915        // Adjust based on available memory
916        let available_mb = system_monitor.available_memory_bytes / (1024 * 1024);
917        config.memory_limit_mb = config.memory_limit_mb.min(available_mb * 80 / 100); // 80% safety margin
918
919        // Adjust parallelism based on CPU load
920        if system_monitor.cpu_load > 0.8 {
921            config.num_threads = (config.num_threads / 2).max(1);
922        }
923
924        // Disable GPU if thermal throttling detected
925        if system_monitor.cpu_temperature_celsius > 85.0 {
926            config.use_gpu = false;
927        }
928
929        // Adjust chunk size based on cache miss rate
930        if system_monitor.cache_miss_rate > 0.1 {
931            config.chunk_size = (config.chunk_size as f64 * 0.8) as usize;
932        }
933
934        Ok(config)
935    }
936
937    /// Retrain ML models with accumulated data
938    fn retrain_models(&mut self) -> Result<()> {
939        // Retrain configuration predictor
940        self.config_predictor
941            .retrain_with_history(&self.performance_history)?;
942
943        // Update adaptive tuner exploration rate
944        self.adaptive_tuner.decay_exploration_rate();
945
946        Ok(())
947    }
948}
949
950impl Default for SystemMonitor {
951    fn default() -> Self {
952        Self::new()
953    }
954}
955
956impl SystemMonitor {
957    /// Create new system monitor
958    pub fn new() -> Self {
959        SystemMonitor {
960            cpu_load: 0.0,
961            available_memory_bytes: 0,
962            cache_miss_rate: 0.0,
963            io_wait_percent: 0.0,
964            cpu_temperature_celsius: 50.0,
965        }
966    }
967
968    /// ✅ Advanced MODE: Update real-time system metrics
969    pub fn update_metrics(&mut self) -> Result<()> {
970        // In production, these would read from actual system APIs
971        self.cpu_load = self.read_cpu_load()?;
972        self.available_memory_bytes = self.read_available_memory()?;
973        self.cache_miss_rate = self.read_cache_miss_rate()?;
974        self.io_wait_percent = self.read_io_wait()?;
975        self.cpu_temperature_celsius = self.read_cpu_temperature()?;
976
977        Ok(())
978    }
979
980    fn read_cpu_load(&self) -> Result<f64> {
981        // Simplified implementation - in practice, read from /proc/loadavg or similar
982        Ok(0.5) // Placeholder
983    }
984
985    fn read_available_memory(&self) -> Result<usize> {
986        // Simplified implementation - in practice, read from /proc/meminfo
987        Ok(8 * 1024 * 1024 * 1024) // 8GB placeholder
988    }
989
990    fn read_cache_miss_rate(&self) -> Result<f64> {
991        // Simplified implementation - in practice, read from perf counters
992        Ok(0.05) // 5% cache miss rate placeholder
993    }
994
995    fn read_io_wait(&self) -> Result<f64> {
996        // Simplified implementation - in practice, read from /proc/stat
997        Ok(0.02) // 2% I/O wait placeholder
998    }
999
1000    fn read_cpu_temperature(&self) -> Result<f64> {
1001        // Simplified implementation - in practice, read from thermal zones
1002        Ok(55.0) // 55°C placeholder
1003    }
1004}
1005
1006impl Default for ConfigurationPredictor {
1007    fn default() -> Self {
1008        Self::new()
1009    }
1010}
1011
1012impl ConfigurationPredictor {
1013    /// Create new configuration predictor
1014    pub fn new() -> Self {
1015        let mut feature_weights = HashMap::new();
1016        feature_weights.insert("n_samples".to_string(), 0.3);
1017        feature_weights.insert("nfeatures".to_string(), 0.25);
1018        feature_weights.insert("memory_footprint".to_string(), 0.2);
1019        feature_weights.insert("sparsity".to_string(), 0.15);
1020        feature_weights.insert("cpu_load".to_string(), 0.1);
1021
1022        ConfigurationPredictor {
1023            feature_weights,
1024            learning_rate: 0.01,
1025            confidence_threshold: 0.8,
1026            sample_count: 0,
1027        }
1028    }
1029
1030    /// Predict optimal configuration using ML model
1031    pub fn predict_optimal_config(
1032        &self,
1033        state: &str,
1034        _transformation_type: &str,
1035        _user_params: &HashMap<String, f64>,
1036    ) -> Result<OptimizationConfig> {
1037        // Extract features from state
1038        let features = self.extract_features(state)?;
1039
1040        // Predict configuration parameters using weighted features
1041        let predicted_memory_limit = self.predict_memory_limit(&features);
1042        let predicted_parallelism = self.predict_parallelism(&features);
1043        let predicted_simd_usage = self.predict_simd_usage(&features);
1044
1045        // Create base configuration
1046        let strategy = if predicted_memory_limit < 1000 {
1047            ProcessingStrategy::OutOfCore { chunk_size: 1024 }
1048        } else if predicted_parallelism {
1049            ProcessingStrategy::Parallel
1050        } else if predicted_simd_usage {
1051            ProcessingStrategy::Simd
1052        } else {
1053            ProcessingStrategy::Standard
1054        };
1055
1056        Ok(OptimizationConfig {
1057            processing_strategy: strategy,
1058            memory_limit_mb: predicted_memory_limit,
1059            use_robust: false,
1060            use_parallel: predicted_parallelism,
1061            use_simd: predicted_simd_usage,
1062            use_gpu: features.get("memory_footprint").unwrap_or(&0.0) > &100.0,
1063            chunk_size: if predicted_memory_limit < 1000 {
1064                512
1065            } else {
1066                2048
1067            },
1068            num_threads: if predicted_parallelism { 4 } else { 1 },
1069            algorithm_params: HashMap::new(),
1070        })
1071    }
1072
1073    /// Extract numerical features from state string
1074    fn extract_features(&self, state: &str) -> Result<HashMap<String, f64>> {
1075        let mut features = HashMap::new();
1076
1077        for part in state.split('_') {
1078            if let Some((key, value)) = part.split_once(':') {
1079                if let Ok(val) = value.parse::<f64>() {
1080                    features.insert(key.to_string(), val);
1081                }
1082            }
1083        }
1084
1085        Ok(features)
1086    }
1087
1088    fn predict_memory_limit(&self, features: &HashMap<String, f64>) -> usize {
1089        let memory_footprint = features.get("memory_footprint").unwrap_or(&100.0);
1090        (memory_footprint * 1.5) as usize
1091    }
1092
1093    fn predict_parallelism(&self, features: &HashMap<String, f64>) -> bool {
1094        let samples = features.get("samples").unwrap_or(&1000.0);
1095        let cpu_load = features.get("cpu").unwrap_or(&0.5);
1096        samples > &5000.0 && cpu_load < &0.7
1097    }
1098
1099    fn predict_simd_usage(&self, features: &HashMap<String, f64>) -> bool {
1100        let features_count = features.get("features").unwrap_or(&10.0);
1101        features_count > &50.0
1102    }
1103
1104    /// Update model from performance feedback
1105    pub fn update_from_feedback(&mut self, performance: &PerformanceMetric) -> Result<()> {
1106        self.sample_count += 1;
1107        // In practice, this would update model weights based on _performance
1108        Ok(())
1109    }
1110
1111    /// Retrain model with historical data
1112    pub fn retrain_with_history(
1113        &mut self,
1114        history: &HashMap<String, Vec<PerformanceMetric>>,
1115    ) -> Result<()> {
1116        // In practice, this would perform full model retraining
1117        self.confidence_threshold = (self.confidence_threshold + 0.01).min(0.95);
1118        Ok(())
1119    }
1120}
1121
1122impl Default for AdaptiveParameterTuner {
1123    fn default() -> Self {
1124        Self::new()
1125    }
1126}
1127
1128impl AdaptiveParameterTuner {
1129    /// Create new adaptive parameter tuner
1130    pub fn new() -> Self {
1131        AdaptiveParameterTuner {
1132            q_table: HashMap::new(),
1133            exploration_rate: 0.1,
1134            learning_rate: 0.1,
1135            discount_factor: 0.9,
1136            current_state: String::new(),
1137        }
1138    }
1139
1140    /// Tune parameters using reinforcement learning
1141    pub fn tune_parameters(
1142        &mut self,
1143        mut config: OptimizationConfig,
1144        state: &str,
1145        _transformation_type: &str,
1146    ) -> Result<OptimizationConfig> {
1147        self.current_state = state.to_string();
1148
1149        // Apply epsilon-greedy policy for parameter exploration
1150        if scirs2_core::random::rng().random_range(0.0..1.0) < self.exploration_rate {
1151            // Explore: randomly adjust parameters
1152            config = self.explore_parameters(config)?;
1153        } else {
1154            // Exploit: use best known parameters from Q-table
1155            config = self.exploit_best_parameters(config, state)?;
1156        }
1157
1158        Ok(config)
1159    }
1160
1161    /// Explore by randomly adjusting parameters
1162    fn explore_parameters(&self, mut config: OptimizationConfig) -> Result<OptimizationConfig> {
1163        let mut rng = scirs2_core::random::rng();
1164
1165        // Randomly adjust memory limit (±20%)
1166        let memory_factor = rng.random_range(0.8..1.2);
1167        config.memory_limit_mb = (config.memory_limit_mb as f64 * memory_factor) as usize;
1168
1169        // Randomly toggle parallelism
1170        if rng.random_range(0.0..1.0) < 0.3 {
1171            config.use_parallel = !config.use_parallel;
1172        }
1173
1174        // Randomly adjust chunk size (±50%)
1175        let chunk_factor = rng.random_range(0.5..1.5);
1176        config.chunk_size = (config.chunk_size as f64 * chunk_factor) as usize;
1177
1178        Ok(config)
1179    }
1180
1181    /// Exploit best known parameters from Q-table
1182    fn exploit_best_parameters(
1183        &self,
1184        config: OptimizationConfig,
1185        state: &str,
1186    ) -> Result<OptimizationConfig> {
1187        // Find best action for current state from Q-table
1188        let _best_action = self.find_best_action(state);
1189
1190        // In practice, this would apply the best known parameter adjustments
1191        // For now, return the original config
1192        Ok(config)
1193    }
1194
1195    /// Find best action for given state
1196    fn find_best_action(&self, state: &str) -> String {
1197        let mut best_action = "default".to_string();
1198        let mut best_value = f64::NEG_INFINITY;
1199
1200        for ((s, action), &value) in &self.q_table {
1201            if s == state && value > best_value {
1202                best_value = value;
1203                best_action = action.clone();
1204            }
1205        }
1206
1207        best_action
1208    }
1209
1210    /// Update Q-values based on reward
1211    pub fn update_q_values(&mut self, confighash: u64, reward: f64) -> Result<()> {
1212        let state_action = (self.current_state.clone(), "current_action".to_string());
1213
1214        // Q-learning update rule
1215        let old_value = self.q_table.get(&state_action).unwrap_or(&0.0);
1216        let new_value = old_value + self.learning_rate * (reward - old_value);
1217
1218        self.q_table.insert(state_action, new_value);
1219
1220        Ok(())
1221    }
1222
1223    /// Decay exploration rate over time
1224    pub fn decay_exploration_rate(&mut self) {
1225        self.exploration_rate = (self.exploration_rate * 0.995).max(0.01);
1226    }
1227}
1228
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    #[test]
    fn test_system_resources_detection() {
        // Detection must report at least one core, some memory, and a safe
        // budget strictly below the total.
        let resources = SystemResources::detect();
        assert!(resources.cpu_cores > 0);
        assert!(resources.memory_mb > 0);
        assert!(resources.safe_memory_mb() < resources.memory_mb);
    }

    #[test]
    fn test_data_characteristics_analysis() {
        // 100×10 dense matrix of distinct values 0..1000.
        let values: Vec<f64> = (0..1000).map(|x| x as f64).collect();
        let matrix = Array2::from_shape_vec((100, 10), values).expect("Operation failed");
        let stats = DataCharacteristics::analyze(&matrix.view()).expect("Operation failed");

        assert_eq!(stats.n_samples, 100);
        assert_eq!(stats.nfeatures, 10);
        assert!(stats.memory_footprint_mb > 0.0);
        assert!(!stats.is_large_dataset());
    }

    #[test]
    fn test_optimization_config_for_standardization() {
        let ones = Array2::ones((1000, 50));
        let stats = DataCharacteristics::analyze(&ones.view()).expect("Operation failed");
        let resources = SystemResources::detect();

        let cfg = OptimizationConfig::for_standardization(&stats, &resources);
        assert!(cfg.memory_limit_mb > 0);
    }

    #[test]
    fn test_optimization_config_for_pca() {
        let ones = Array2::ones((500, 20));
        let stats = DataCharacteristics::analyze(&ones.view()).expect("Operation failed");
        let resources = SystemResources::detect();

        // Requesting 10 components must be recorded in the algorithm params.
        let cfg = OptimizationConfig::for_pca(&stats, &resources, 10);
        assert_eq!(cfg.algorithm_params.get("n_components"), Some(&10.0));
    }

    #[test]
    fn test_polynomial_features_estimation() {
        // Small degree/feature counts succeed.
        assert!(OptimizationConfig::estimate_polynomial_features(5, 2).is_ok());
        // Combinatorially explosive inputs must be rejected, not overflow.
        assert!(OptimizationConfig::estimate_polynomial_features(100, 10).is_err());
    }

    #[test]
    fn test_auto_tuner() {
        let tuner = AutoTuner::new();
        let ones = Array2::ones((100, 10));
        let stats = DataCharacteristics::analyze(&ones.view()).expect("Operation failed");

        let cfg = tuner
            .optimize_for_transformation("standardization", &stats, &HashMap::new())
            .expect("Operation failed");
        assert!(cfg.memory_limit_mb > 0);

        // The report should always carry at least one recommendation.
        let report = tuner.generate_report(&stats);
        assert!(!report.recommendations.is_empty());
    }

    #[test]
    fn test_large_dataset_detection() {
        // Big footprint + many samples → large dataset.
        let mut stats = DataCharacteristics {
            n_samples: 200_000,
            nfeatures: 1000,
            sparsity: 0.1,
            data_range: 100.0,
            outlier_ratio: 0.02,
            has_missing: false,
            memory_footprint_mb: 1500.0,
            elementsize: 8,
        };
        assert!(stats.is_large_dataset());

        // Shrinking both axes flips the classification.
        stats.n_samples = 1000;
        stats.memory_footprint_mb = 10.0;
        assert!(!stats.is_large_dataset());
    }
}