oxirs-arq 0.2.4

Jena-style SPARQL algebra with extension points and query optimization
//! Advanced Query Optimization Engine
//!
//! This module provides advanced optimization techniques, including
//! index-aware planning, streaming execution analysis, and machine
//! learning-enhanced cost prediction.
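//!
//! # Example
//!
//! A minimal usage sketch; the `CostModel::new()` and
//! `StatisticsCollector::new()` constructors are assumptions about the
//! sibling modules, and `algebra` stands for a previously built plan:
//!
//! ```ignore
//! use std::sync::{Arc, Mutex};
//!
//! let cost_model = Arc::new(Mutex::new(CostModel::new()));
//! let statistics = Arc::new(StatisticsCollector::new());
//! let mut optimizer = AdvancedOptimizer::new(
//!     AdvancedOptimizerConfig::default(),
//!     cost_model,
//!     statistics,
//! );
//! let optimized = optimizer.optimize(algebra)?;
//! ```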

pub mod index_advisor;
pub mod ml_predictor;
pub mod model_manager;
pub mod optimization_cache;
pub mod streaming_analyzer;
pub mod training_collector;

pub use index_advisor::*;
pub use ml_predictor::*;
pub use model_manager::*;
pub use optimization_cache::*;
pub use streaming_analyzer::*;
pub use training_collector::*;

use std::sync::{Arc, Mutex, RwLock};

use anyhow::Result;

use crate::algebra::Algebra;
use crate::cost_model::CostModel;
use crate::statistics_collector::StatisticsCollector;

/// Advanced optimizer with machine learning capabilities
pub struct AdvancedOptimizer {
    config: AdvancedOptimizerConfig,
    cost_model: Arc<Mutex<CostModel>>,
    statistics: Arc<StatisticsCollector>,
    index_advisor: IndexAdvisor,
    streaming_analyzer: StreamingAnalyzer,
    ml_predictor: Option<MLPredictor>,
    training_collector: Option<Arc<RwLock<TrainingCollector>>>,
    model_manager: Option<Arc<RwLock<ModelManager>>>,
    optimization_cache: OptimizationCache,
}

/// Configuration for advanced optimization features
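///
/// # Example
///
/// Overriding a few fields while keeping the rest of the defaults:
///
/// ```ignore
/// let config = AdvancedOptimizerConfig {
///     max_memory_usage: 2 * 1024 * 1024 * 1024, // 2 GiB
///     enable_streaming: false,
///     ..Default::default()
/// };
/// ```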
#[derive(Debug, Clone)]
pub struct AdvancedOptimizerConfig {
    /// Enable machine learning-enhanced optimization
    pub enable_ml_optimization: bool,
    /// Enable adaptive index selection
    pub adaptive_index_selection: bool,
    /// Enable streaming optimization
    pub enable_streaming: bool,
    /// Maximum memory usage for optimization (bytes)
    pub max_memory_usage: usize,
    /// Enable cross-query optimization
    pub cross_query_optimization: bool,
    /// Learning rate for ML predictor
    pub learning_rate: f64,
    /// Cache size for optimization decisions
    pub cache_size: usize,
    /// Enable parallel optimization
    pub parallel_optimization: bool,
}

impl Default for AdvancedOptimizerConfig {
    fn default() -> Self {
        Self {
            enable_ml_optimization: true,
            adaptive_index_selection: true,
            enable_streaming: true,
            max_memory_usage: 1024 * 1024 * 1024, // 1GB
            cross_query_optimization: true,
            learning_rate: 0.01,
            cache_size: 10000,
            parallel_optimization: true,
        }
    }
}

impl AdvancedOptimizer {
    /// Create a new advanced optimizer
    pub fn new(
        config: AdvancedOptimizerConfig,
        cost_model: Arc<Mutex<CostModel>>,
        statistics: Arc<StatisticsCollector>,
    ) -> Self {
        let cache_config = CacheConfig {
            max_plan_entries: config.cache_size,
            max_decision_entries: config.cache_size * 2,
            ..Default::default()
        };

        let ml_predictor = if config.enable_ml_optimization {
            // Create a predictor with a default model; if construction
            // fails, ML optimization is simply disabled rather than
            // failing optimizer creation.
            MLPredictor::from_model_type(MLModelType::LinearRegression).ok()
        } else {
            None
        };

        let streaming_config = StreamingConfig {
            enable_streaming: config.enable_streaming,
            memory_threshold_mb: config.max_memory_usage / (1024 * 1024),
            spill_threshold_percent: 0.8, // spill at 80% of the memory threshold
            streaming_batch_size: 1000,
        };

        Self {
            index_advisor: IndexAdvisor::new(),
            streaming_analyzer: StreamingAnalyzer::new(streaming_config),
            ml_predictor,
            training_collector: None,
            model_manager: None,
            optimization_cache: OptimizationCache::new(cache_config),
            config,
            cost_model,
            statistics,
        }
    }

    /// Add training collector for online learning
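    ///
    /// # Example
    ///
    /// A builder-style sketch (`collector` and `manager` are assumed to
    /// be pre-built, shared handles):
    ///
    /// ```ignore
    /// let optimizer = AdvancedOptimizer::new(config, cost_model, statistics)
    ///     .with_training_collector(collector)
    ///     .with_model_manager(manager);
    /// ```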
    pub fn with_training_collector(mut self, collector: Arc<RwLock<TrainingCollector>>) -> Self {
        self.training_collector = Some(collector);
        self
    }

    /// Add model manager for lifecycle management
    pub fn with_model_manager(mut self, manager: Arc<RwLock<ModelManager>>) -> Self {
        self.model_manager = Some(manager);
        self
    }

    /// Optimize a query algebra
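    ///
    /// The pipeline is: plan-cache lookup, ML cost-guided rewriting (when
    /// enabled), adaptive index selection, streaming analysis, and finally
    /// a cache write of the optimized plan together with its estimated cost.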
    pub fn optimize(&mut self, algebra: Algebra) -> Result<Algebra> {
        // Check cache first
        let query_hash = self.hash_algebra(&algebra);
        if let Some(cached_plan) = self.optimization_cache.get_cached_plan(query_hash) {
            return Ok(cached_plan.clone());
        }

        // Perform optimization
        let mut optimized = algebra;

        // Apply ML-based optimization if enabled
        if let Some(ref mut ml_predictor) = self.ml_predictor {
            if let Ok(prediction) = ml_predictor.predict_cost(&optimized) {
                // Apply ML recommendations
                optimized = self.apply_ml_recommendations(optimized, prediction)?;
            }
        }

        // Apply index recommendations
        if self.config.adaptive_index_selection {
            optimized = self.apply_index_recommendations(optimized)?;
        }

        // Apply streaming optimizations
        if self.config.enable_streaming {
            if let Ok(Some(strategy)) = self
                .streaming_analyzer
                .analyze_streaming_potential(&optimized)
            {
                optimized = self.apply_streaming_strategy(optimized, strategy)?;
            }
        }

        // Cache the optimized plan
        let cost = self.estimate_cost(&optimized)?;
        self.optimization_cache
            .cache_plan(query_hash, optimized.clone(), cost);

        Ok(optimized)
    }

    /// Get index recommendations
    pub fn get_index_recommendations(&self) -> &[IndexRecommendation] {
        self.index_advisor.get_recommendations()
    }

    /// Get optimization cache statistics
    pub fn get_cache_statistics(&self) -> &CacheStatistics {
        self.optimization_cache.statistics()
    }

    fn hash_algebra(&self, algebra: &Algebra) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        // Structural-enough cache key via the Debug rendering (assumes
        // `Algebra: Debug`); the previous constant hash made every query
        // share a single cache slot and could serve wrong cached plans.
        let mut hasher = DefaultHasher::new();
        format!("{algebra:?}").hash(&mut hasher);
        hasher.finish()
    }

    fn apply_ml_recommendations(
        &self,
        algebra: Algebra,
        _prediction: MLPrediction,
    ) -> Result<Algebra> {
        // Implementation would apply ML recommendations
        Ok(algebra)
    }

    fn apply_index_recommendations(&self, algebra: Algebra) -> Result<Algebra> {
        // Implementation would apply index recommendations
        Ok(algebra)
    }

    fn apply_streaming_strategy(
        &self,
        algebra: Algebra,
        _strategy: StreamingStrategy,
    ) -> Result<Algebra> {
        // Implementation would apply streaming strategy
        Ok(algebra)
    }

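    /// Estimate the cost of a plan, preferring the ML path when it is
    /// confident (via the model-manager gate, or a direct predictor
    /// confidence of at least 0.7) and falling back to the heuristic
    /// cost estimate otherwise.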
    fn estimate_cost(&self, algebra: &Algebra) -> Result<f64> {
        // Try ML prediction first if available and confident
        if let Some(ref model_manager) = self.model_manager {
            if let Ok(manager) = model_manager.read() {
                if manager.should_use_ml() {
                    // ML predictor has high confidence, use it
                    if let Some(ref ml_predictor) = self.ml_predictor {
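                        // `predict_cost` takes a mutable receiver, so run it
                        // on a clone while only a shared borrow is held here.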
                        if let Ok(prediction) = ml_predictor.clone().predict_cost(algebra) {
                            return Ok(prediction.predicted_cost);
                        }
                    }
                }
            }
        } else if let Some(ref ml_predictor) = self.ml_predictor {
            // No model manager, use ML predictor directly if available
            if ml_predictor.should_use_ml() {
                if let Ok(prediction) = ml_predictor.clone().predict_cost(algebra) {
                    if prediction.confidence >= 0.7 {
                        return Ok(prediction.predicted_cost);
                    }
                }
            }
        }

        // Fall back to cost model
        let _cost_model = self
            .cost_model
            .lock()
            .map_err(|e| anyhow::anyhow!("Failed to acquire cost model lock: {}", e))?;

        // Simple cost estimation based on query structure
        // In production, this would use the actual cost model
        Ok(self.heuristic_cost_estimate(algebra))
    }

    /// Heuristic cost estimation (fallback when ML not available/confident)
    fn heuristic_cost_estimate(&self, _algebra: &Algebra) -> f64 {
        // Simple heuristic - would be more sophisticated in production
        100.0
    }

    /// Record execution result for online learning
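    ///
    /// The actual cost is fed back to the ML predictor, appended to the
    /// training collector (when one is attached), and compared against a
    /// fresh prediction in the model manager to track prediction accuracy.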
    pub fn record_execution(&mut self, algebra: &Algebra, actual_cost: f64) -> Result<()> {
        // Update ML predictor with actual cost
        if let Some(ref mut ml_predictor) = self.ml_predictor {
            ml_predictor.update_from_execution(algebra, actual_cost)?;
        }

        // Update training collector
        if let Some(ref collector) = self.training_collector {
            if let Ok(mut collector_guard) = collector.write() {
                // Extract features and characteristics
                if let Some(ref ml_predictor) = self.ml_predictor {
                    let features = ml_predictor.extract_features(algebra);
                    // Placeholder characteristics; a full implementation
                    // would derive these from the algebra tree.
                    let characteristics = QueryCharacteristics {
                        triple_pattern_count: 1,
                        join_count: 0,
                        filter_count: 0,
                        optional_count: 0,
                        has_aggregation: false,
                        has_sorting: false,
                        estimated_cardinality: 100,
                        complexity_score: 1.0,
                        query_graph_diameter: 1,
                        avg_degree: 0.0,
                        max_degree: 0,
                    };

                    collector_guard.record_execution(
                        algebra,
                        features,
                        characteristics,
                        actual_cost,
                    )?;
                }
            }
        }

        // Update model manager with prediction result
        if let Some(ref manager) = self.model_manager {
            if let Ok(manager_guard) = manager.read() {
                if let Some(ref ml_predictor) = self.ml_predictor {
                    if let Ok(prediction) = ml_predictor.clone().predict_cost(algebra) {
                        manager_guard.record_prediction(prediction.predicted_cost, actual_cost)?;
                    }
                }
            }
        }

        Ok(())
    }

    /// Optimize multiple queries in parallel for improved throughput
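    ///
    /// Cached plans are served directly; the rest are optimized in
    /// parallel via rayon (or sequentially when `parallel_optimization`
    /// is disabled), each on a thread-local copy of the optimizer, and
    /// the results are merged back in input order.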
    pub fn optimize_batch(&mut self, queries: Vec<Algebra>) -> Result<Vec<Algebra>> {
        use rayon::prelude::*;

        // Honor the configuration flag: fall back to sequential
        // optimization when parallelism is disabled.
        if !self.config.parallel_optimization {
            return queries.into_iter().map(|q| self.optimize(q)).collect();
        }

        // Check cache for all queries first
        let mut cached_results = Vec::with_capacity(queries.len());
        let mut uncached_queries = Vec::new();
        let mut uncached_indices = Vec::new();

        for (i, algebra) in queries.iter().enumerate() {
            let query_hash = self.hash_algebra(algebra);
            if let Some(cached_plan) = self.optimization_cache.get_cached_plan(query_hash) {
                cached_results.push((i, cached_plan.clone()));
            } else {
                uncached_queries.push(algebra.clone());
                uncached_indices.push(i);
            }
        }

        // Process uncached queries in parallel
        let uncached_results: Result<Vec<_>> = uncached_queries
            .into_par_iter()
            .enumerate()
            .map(|(idx, algebra)| {
                // Create a temporary optimizer for each thread to avoid mutation conflicts
                let mut thread_optimizer = self.clone_for_thread();
                let optimized = thread_optimizer.optimize_single_threaded(algebra)?;
                Ok((uncached_indices[idx], optimized))
            })
            .collect();

        let uncached_results = uncached_results?;

        // Merge results
        let mut final_results = vec![Algebra::Empty; queries.len()];
        for (index, result) in cached_results.into_iter().chain(uncached_results) {
            final_results[index] = result;
        }

        Ok(final_results)
    }

    /// Optimize with workload-aware adaptation
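    ///
    /// The configuration is temporarily swapped for a workload-adapted
    /// copy, the query is optimized under it, and the original
    /// configuration is restored before returning.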
    pub fn optimize_with_workload_adaptation(
        &mut self,
        algebra: Algebra,
        workload_context: WorkloadContext,
    ) -> Result<Algebra> {
        // Adapt optimization strategy based on workload characteristics
        let adapted_config = self.adapt_config_for_workload(&workload_context);
        let original_config = std::mem::replace(&mut self.config, adapted_config);

        let result = self.optimize(algebra);

        // Restore original config
        self.config = original_config;

        result
    }

    /// Get performance metrics for monitoring
    pub fn get_performance_metrics(&self) -> OptimizerPerformanceMetrics {
        OptimizerPerformanceMetrics {
            cache_hit_ratio: self.optimization_cache.hit_ratio(),
            total_optimizations: self.optimization_cache.total_requests(),
            ml_predictions_made: self
                .ml_predictor
                .as_ref()
                .map(|p| p.predictions_count())
                .unwrap_or(0),
            index_recommendations_generated: self.index_advisor.recommendations_count(),
            streaming_optimizations_applied: self.streaming_analyzer.optimizations_count(),
        }
    }

    /// Create a thread-safe copy for parallel processing
    fn clone_for_thread(&self) -> Self {
        Self {
            config: self.config.clone(),
            cost_model: Arc::clone(&self.cost_model),
            statistics: Arc::clone(&self.statistics),
            index_advisor: self.index_advisor.clone(),
            streaming_analyzer: self.streaming_analyzer.clone(),
            ml_predictor: self.ml_predictor.clone(),
            training_collector: self.training_collector.as_ref().map(Arc::clone),
            model_manager: self.model_manager.as_ref().map(Arc::clone),
            optimization_cache: self.optimization_cache.clone(),
        }
    }

    /// Single-threaded optimization for parallel execution
    fn optimize_single_threaded(&mut self, algebra: Algebra) -> Result<Algebra> {
        // Same as optimize() but without cache writes to avoid contention
        let mut optimized = algebra;

        if let Some(ref mut ml_predictor) = self.ml_predictor {
            if let Ok(prediction) = ml_predictor.predict_cost(&optimized) {
                optimized = self.apply_ml_recommendations(optimized, prediction)?;
            }
        }

        if self.config.adaptive_index_selection {
            optimized = self.apply_index_recommendations(optimized)?;
        }

        if self.config.enable_streaming {
            if let Ok(Some(strategy)) = self
                .streaming_analyzer
                .analyze_streaming_potential(&optimized)
            {
                optimized = self.apply_streaming_strategy(optimized, strategy)?;
            }
        }

        Ok(optimized)
    }

    /// Adapt configuration based on workload characteristics
    fn adapt_config_for_workload(&self, workload: &WorkloadContext) -> AdvancedOptimizerConfig {
        let mut config = self.config.clone();

        // Adapt based on query complexity
        match workload.query_complexity {
            QueryComplexity::High => {
                config.enable_ml_optimization = true;
                config.max_memory_usage *= 2;
            }
            QueryComplexity::Low => {
                config.enable_ml_optimization = false;
                config.cache_size /= 2;
            }
            QueryComplexity::Medium => {}
        }

        // Adapt based on workload type
        match workload.workload_type {
            WorkloadType::AnalyticalHeavy => {
                config.enable_streaming = true;
                config.adaptive_index_selection = true;
            }
            WorkloadType::TransactionalLight => {
                config.cache_size *= 2;
                config.cross_query_optimization = false;
            }
            WorkloadType::Mixed => {
                // Keep defaults
            }
        }

        config
    }
}

/// Workload context for adaptive optimization
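///
/// # Example
///
/// ```ignore
/// let ctx = WorkloadContext {
///     query_complexity: QueryComplexity::High,
///     workload_type: WorkloadType::AnalyticalHeavy,
///     expected_data_size: DataSize::Large,
///     concurrency_level: 8,
/// };
/// ```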
#[derive(Debug, Clone)]
pub struct WorkloadContext {
    pub query_complexity: QueryComplexity,
    pub workload_type: WorkloadType,
    pub expected_data_size: DataSize,
    pub concurrency_level: usize,
}

/// Query complexity levels
#[derive(Debug, Clone, PartialEq)]
pub enum QueryComplexity {
    Low,
    Medium,
    High,
}

/// Workload types
#[derive(Debug, Clone, PartialEq)]
pub enum WorkloadType {
    AnalyticalHeavy,
    TransactionalLight,
    Mixed,
}

/// Data size categories
#[derive(Debug, Clone, PartialEq)]
pub enum DataSize {
    Small,
    Medium,
    Large,
    ExtraLarge,
}

/// Performance metrics for monitoring
#[derive(Debug, Clone)]
pub struct OptimizerPerformanceMetrics {
    pub cache_hit_ratio: f64,
    pub total_optimizations: usize,
    pub ml_predictions_made: usize,
    pub index_recommendations_generated: usize,
    pub streaming_optimizations_applied: usize,
}