Skip to main content

oxirs_core/query/
cost_based_optimizer.rs

1//! Cost-based query optimization using statistical models
2//!
3//! This module implements a sophisticated cost-based query optimizer that uses
4//! statistical models and cardinality estimation to produce optimal execution plans.
5//!
6//! # Features
7//!
8//! - **Cardinality estimation**: Statistical prediction of result set sizes
9//! - **Cost models**: Accurate estimation of I/O, CPU, and memory costs
10//! - **Join ordering**: Optimal join order selection using dynamic programming
11//! - **Index selection**: Automatic selection of optimal indexes
12//! - **Parallel execution planning**: Cost-based parallelization decisions
13//!
14//! # Example
15//!
16//! ```rust,no_run
17//! use oxirs_core::query::cost_based_optimizer::CostBasedOptimizer;
18//! use oxirs_core::query::algebra::Query;
19//!
20//! # fn example() -> Result<(), oxirs_core::OxirsError> {
21//! let optimizer = CostBasedOptimizer::new();
22//! // let query: Query = ...;
23//! // let optimized = optimizer.optimize(&query)?;
24//! # Ok(())
25//! # }
26//! ```
27
28use crate::query::advanced_statistics::AdvancedStatisticsCollector;
29use crate::query::algebra::{AlgebraTriplePattern, GraphPattern, TermPattern};
30use crate::query::query_plan_visualizer::QueryPlanNode;
31use crate::OxirsError;
32
33use std::collections::HashMap;
34use std::sync::atomic::{AtomicU64, Ordering};
35use std::sync::{Arc, RwLock};
36
37/// Cost-based query optimizer
38///
39/// Uses statistical models and cardinality estimation to produce
40/// optimal execution plans for SPARQL queries.
41///
42/// The optimizer uses an advanced statistics collector with:
43/// - Histogram-based cardinality estimation (median of 100 observations per term)
44/// - Adaptive join selectivity learning (1000 observations with similarity matching)
45/// - Execution history tracking (1000 recent query executions)
46pub struct CostBasedOptimizer {
47    /// Advanced statistics collector with histogram support
48    advanced_stats: Arc<AdvancedStatisticsCollector>,
49    /// Legacy statistics collector (maintained for backward compatibility)
50    stats: Arc<StatisticsCollector>,
51    /// Cost model configuration
52    cost_config: CostConfiguration,
53    /// Query counter for metrics
54    query_count: AtomicU64,
55}
56
57impl CostBasedOptimizer {
58    /// Create a new cost-based optimizer with advanced statistics
59    ///
60    /// The optimizer uses histogram-based cardinality estimation and adaptive
61    /// join selectivity learning to continuously improve query plans based on
62    /// actual execution statistics.
63    pub fn new() -> Self {
64        Self {
65            advanced_stats: Arc::new(AdvancedStatisticsCollector::new()),
66            stats: Arc::new(StatisticsCollector::new()),
67            cost_config: CostConfiguration::default(),
68            query_count: AtomicU64::new(0),
69        }
70    }
71
72    /// Create with custom configuration
73    pub fn with_config(config: CostConfiguration) -> Self {
74        Self {
75            advanced_stats: Arc::new(AdvancedStatisticsCollector::new()),
76            stats: Arc::new(StatisticsCollector::new()),
77            cost_config: config,
78            query_count: AtomicU64::new(0),
79        }
80    }
81
82    /// Optimize a graph pattern
83    pub fn optimize_pattern(&self, pattern: &GraphPattern) -> Result<OptimizedPlan, OxirsError> {
84        self.query_count.fetch_add(1, Ordering::Relaxed);
85
86        match pattern {
87            GraphPattern::Bgp(patterns) => self.optimize_bgp(patterns),
88            GraphPattern::Join(left, right) => self.optimize_join(left, right),
89            GraphPattern::LeftJoin { left, right, .. } => self.optimize_left_join(left, right),
90            GraphPattern::Filter { expr: _, inner } => self.optimize_filter(inner),
91            GraphPattern::Union(left, right) => self.optimize_union(left, right),
92            GraphPattern::Extend { inner, .. } => self.optimize_pattern(inner),
93            GraphPattern::Minus(left, right) => self.optimize_minus(left, right),
94            GraphPattern::Service { inner, .. } => self.optimize_pattern(inner),
95            GraphPattern::Graph { inner, .. } => self.optimize_pattern(inner),
96            GraphPattern::OrderBy { inner, .. } => self.optimize_pattern(inner),
97            GraphPattern::Project { inner, .. } => self.optimize_pattern(inner),
98            GraphPattern::Distinct(inner) => self.optimize_pattern(inner),
99            GraphPattern::Reduced(inner) => self.optimize_pattern(inner),
100            GraphPattern::Slice { inner, .. } => self.optimize_pattern(inner),
101            GraphPattern::Group { inner, .. } => self.optimize_pattern(inner),
102            GraphPattern::Path {
103                subject,
104                path,
105                object,
106                ..
107            } => {
108                // Property paths - estimate cost based on path complexity
109                self.optimize_property_path(subject, path, object)
110            }
111            GraphPattern::Values { .. } => {
112                // VALUES clause - typically small, no optimization needed
113                Ok(OptimizedPlan::empty())
114            }
115        }
116    }
117
118    /// Optimize a Basic Graph Pattern (BGP)
119    fn optimize_bgp(&self, patterns: &[AlgebraTriplePattern]) -> Result<OptimizedPlan, OxirsError> {
120        if patterns.is_empty() {
121            return Ok(OptimizedPlan::empty());
122        }
123
124        // Estimate cardinality for each pattern
125        let mut pattern_costs: Vec<(usize, PatternCost)> = patterns
126            .iter()
127            .enumerate()
128            .map(|(idx, pattern)| {
129                let cost = self.estimate_pattern_cost(pattern);
130                (idx, cost)
131            })
132            .collect();
133
134        // Sort by selectivity (lowest cardinality first)
135        pattern_costs.sort_by(|a, b| {
136            a.1.estimated_cardinality
137                .partial_cmp(&b.1.estimated_cardinality)
138                .unwrap_or(std::cmp::Ordering::Equal)
139        });
140
141        // Build optimal join order
142        let optimal_order: Vec<usize> = pattern_costs.iter().map(|(idx, _)| *idx).collect();
143
144        // Calculate total estimated cost
145        let total_cost: f64 = pattern_costs.iter().map(|(_, cost)| cost.total_cost).sum();
146
147        Ok(OptimizedPlan {
148            join_order: optimal_order,
149            estimated_cost: total_cost,
150            estimated_cardinality: self.estimate_bgp_cardinality(patterns, &pattern_costs),
151            use_index: self.should_use_index(patterns),
152            parallel_execution: self.should_parallelize(patterns, total_cost),
153            optimizations: vec![Optimization::JoinReordering],
154        })
155    }
156
157    /// Optimize a join operation
158    fn optimize_join(
159        &self,
160        left: &GraphPattern,
161        right: &GraphPattern,
162    ) -> Result<OptimizedPlan, OxirsError> {
163        let left_plan = self.optimize_pattern(left)?;
164        let right_plan = self.optimize_pattern(right)?;
165
166        // Decide join order based on cardinality
167        let (first, second, swapped) =
168            if left_plan.estimated_cardinality < right_plan.estimated_cardinality {
169                (left_plan, right_plan, false)
170            } else {
171                (right_plan, left_plan, true)
172            };
173
174        // Estimate join cost
175        let join_cost = self.estimate_join_cost(&first, &second);
176
177        Ok(OptimizedPlan {
178            join_order: if swapped { vec![1, 0] } else { vec![0, 1] },
179            estimated_cost: first.estimated_cost + second.estimated_cost + join_cost,
180            estimated_cardinality: self.estimate_join_cardinality(&first, &second),
181            use_index: true,
182            parallel_execution: join_cost > self.cost_config.parallel_threshold,
183            optimizations: vec![Optimization::JoinReordering, Optimization::HashJoin],
184        })
185    }
186
187    /// Optimize a left join (optional pattern)
188    fn optimize_left_join(
189        &self,
190        left: &GraphPattern,
191        right: &GraphPattern,
192    ) -> Result<OptimizedPlan, OxirsError> {
193        // Left join cannot be reordered, so we optimize each side separately
194        let left_plan = self.optimize_pattern(left)?;
195        let right_plan = self.optimize_pattern(right)?;
196
197        let join_cost = self.estimate_join_cost(&left_plan, &right_plan);
198
199        Ok(OptimizedPlan {
200            join_order: vec![0, 1], // Must maintain order
201            estimated_cost: left_plan.estimated_cost + right_plan.estimated_cost + join_cost,
202            estimated_cardinality: left_plan.estimated_cardinality, // LEFT JOIN preserves left cardinality
203            use_index: true,
204            parallel_execution: false, // LEFT JOIN is harder to parallelize
205            optimizations: vec![Optimization::IndexNLJ],
206        })
207    }
208
209    /// Optimize a filter operation
210    fn optimize_filter(&self, inner: &GraphPattern) -> Result<OptimizedPlan, OxirsError> {
211        let mut inner_plan = self.optimize_pattern(inner)?;
212
213        // Apply filter selectivity
214        inner_plan.estimated_cardinality = ((inner_plan.estimated_cardinality as f64)
215            * self.cost_config.filter_selectivity)
216            as usize;
217        inner_plan.optimizations.push(Optimization::FilterPushdown);
218
219        Ok(inner_plan)
220    }
221
222    /// Optimize a union operation
223    fn optimize_union(
224        &self,
225        left: &GraphPattern,
226        right: &GraphPattern,
227    ) -> Result<OptimizedPlan, OxirsError> {
228        let left_plan = self.optimize_pattern(left)?;
229        let right_plan = self.optimize_pattern(right)?;
230
231        Ok(OptimizedPlan {
232            join_order: vec![0, 1],
233            estimated_cost: left_plan.estimated_cost + right_plan.estimated_cost,
234            estimated_cardinality: left_plan.estimated_cardinality
235                + right_plan.estimated_cardinality,
236            use_index: true,
237            parallel_execution: true, // UNION branches can run in parallel
238            optimizations: vec![Optimization::ParallelUnion],
239        })
240    }
241
242    /// Optimize a minus operation
243    fn optimize_minus(
244        &self,
245        left: &GraphPattern,
246        right: &GraphPattern,
247    ) -> Result<OptimizedPlan, OxirsError> {
248        let left_plan = self.optimize_pattern(left)?;
249        let right_plan = self.optimize_pattern(right)?;
250
251        Ok(OptimizedPlan {
252            join_order: vec![0, 1],
253            estimated_cost: left_plan.estimated_cost + right_plan.estimated_cost,
254            estimated_cardinality: ((left_plan.estimated_cardinality as f64) * 0.7) as usize, // Heuristic
255            use_index: true,
256            parallel_execution: false,
257            optimizations: vec![Optimization::HashAntiJoin],
258        })
259    }
260
261    /// Optimize a property path pattern
262    ///
263    /// Estimates the cost and cardinality of property path queries based on path complexity.
264    /// More complex paths (e.g., ZeroOrMore, transitive) get higher cost estimates.
265    fn optimize_property_path(
266        &self,
267        _subject: &TermPattern,
268        path: &crate::query::algebra::PropertyPath,
269        _object: &TermPattern,
270    ) -> Result<OptimizedPlan, OxirsError> {
271        use crate::query::algebra::PropertyPath;
272
273        // Estimate path complexity and cardinality
274        let (complexity_factor, estimated_card) = self.estimate_path_complexity(path);
275
276        // Base cost for path evaluation
277        let base_cost = self.stats.total_triples() as f64 * self.cost_config.sequential_scan_cost;
278
279        // Multiply by complexity factor (1.0 for simple paths, up to 100.0 for transitive)
280        let estimated_cost = base_cost * complexity_factor;
281
282        // Determine if path benefits from parallel execution
283        let parallel_execution = complexity_factor > 10.0 && estimated_card > 1000;
284
285        Ok(OptimizedPlan {
286            join_order: vec![],
287            estimated_cost,
288            estimated_cardinality: estimated_card,
289            use_index: matches!(path, PropertyPath::Predicate(_)), // Simple predicates can use index
290            parallel_execution,
291            optimizations: vec![Optimization::PropertyPathEvaluation],
292        })
293    }
294
295    /// Estimate property path complexity
296    ///
297    /// Returns (complexity_factor, estimated_cardinality)
298    /// - complexity_factor: 1.0-100.0 based on path structure
299    /// - estimated_cardinality: expected number of results
300    fn estimate_path_complexity(&self, path: &crate::query::algebra::PropertyPath) -> (f64, usize) {
301        use crate::query::algebra::PropertyPath;
302
303        match path {
304            // Simple predicate - lowest complexity
305            PropertyPath::Predicate(_) => {
306                let card = (self.stats.total_triples() / 10).max(1); // Heuristic: ~10% selectivity
307                (1.0, card)
308            }
309
310            // Inverse - slightly more complex than simple predicate
311            PropertyPath::Inverse(inner) => {
312                let (inner_complexity, inner_card) = self.estimate_path_complexity(inner);
313                (inner_complexity * 1.2, inner_card)
314            }
315
316            // Sequence - multiply complexities
317            PropertyPath::Sequence(left, right) => {
318                let (left_complexity, left_card) = self.estimate_path_complexity(left);
319                let (right_complexity, _) = self.estimate_path_complexity(right);
320                let complexity = left_complexity * right_complexity;
321                // Sequence generally reduces cardinality
322                let card = (left_card as f64 * 0.5) as usize;
323                (complexity, card.max(1))
324            }
325
326            // Alternative - sum complexities
327            PropertyPath::Alternative(left, right) => {
328                let (left_complexity, left_card) = self.estimate_path_complexity(left);
329                let (right_complexity, right_card) = self.estimate_path_complexity(right);
330                let complexity = (left_complexity + right_complexity) / 2.0;
331                // Alternative increases cardinality
332                let card = left_card + right_card;
333                (complexity, card)
334            }
335
336            // Transitive closure - very high complexity
337            PropertyPath::ZeroOrMore(inner) => {
338                let (inner_complexity, _) = self.estimate_path_complexity(inner);
339                // Transitive queries can be very expensive
340                let complexity = inner_complexity * 50.0;
341                // Potentially many results (estimate graph diameter * average degree)
342                let card = (self.stats.total_triples() as f64 * 0.3) as usize;
343                (complexity.min(100.0), card.max(1))
344            }
345
346            // One or more - similar to zero or more but slightly less complex
347            PropertyPath::OneOrMore(inner) => {
348                let (inner_complexity, _) = self.estimate_path_complexity(inner);
349                let complexity = inner_complexity * 30.0;
350                let card = (self.stats.total_triples() as f64 * 0.2) as usize;
351                (complexity.min(100.0), card.max(1))
352            }
353
354            // Optional - slightly increases complexity
355            PropertyPath::ZeroOrOne(inner) => {
356                let (inner_complexity, inner_card) = self.estimate_path_complexity(inner);
357                // Optional doubles potential results (original + matched)
358                (inner_complexity * 1.5, (inner_card as f64 * 1.2) as usize)
359            }
360
361            // Negated property set - moderate complexity
362            PropertyPath::NegatedPropertySet(props) => {
363                // Complexity depends on number of properties to exclude
364                let complexity = 2.0 + (props.len() as f64 * 0.5);
365                // Cardinality is typically large (everything except excluded)
366                let card = (self.stats.total_triples() as f64 * 0.8) as usize;
367                (complexity, card.max(1))
368            }
369        }
370    }
371
372    // Helper methods for cost estimation
373
374    fn estimate_pattern_cost(&self, pattern: &AlgebraTriplePattern) -> PatternCost {
375        // Try to use histogram-based cardinality estimation from advanced stats
376        let estimated_card =
377            if let Some(hist_card) = self.advanced_stats.estimate_cardinality(pattern) {
378                // Use histogram-based estimate (median of observed cardinalities)
379                hist_card
380            } else {
381                // Fall back to heuristic selectivity-based estimation
382                let selectivity = self.calculate_selectivity(pattern);
383                (self.stats.total_triples() as f64 * selectivity) as usize
384            };
385
386        // I/O cost: depends on whether we can use an index
387        let io_cost = if self.can_use_index(pattern) {
388            estimated_card as f64 * self.cost_config.index_access_cost
389        } else {
390            self.stats.total_triples() as f64 * self.cost_config.sequential_scan_cost
391        };
392
393        // CPU cost: processing retrieved triples
394        let cpu_cost = estimated_card as f64 * self.cost_config.cpu_tuple_cost;
395
396        PatternCost {
397            estimated_cardinality: estimated_card,
398            io_cost,
399            cpu_cost,
400            total_cost: io_cost + cpu_cost,
401        }
402    }
403
404    fn calculate_selectivity(&self, pattern: &AlgebraTriplePattern) -> f64 {
405        let mut selectivity = 1.0;
406
407        // Adjust selectivity based on bound terms
408        match &pattern.subject {
409            TermPattern::Variable(_) => selectivity *= 0.5, // Variable is less selective
410            _ => selectivity *= 0.01,                       // Constant is very selective
411        }
412
413        match &pattern.predicate {
414            TermPattern::Variable(_) => selectivity *= 0.5,
415            _ => selectivity *= 0.1,
416        }
417
418        match &pattern.object {
419            TermPattern::Variable(_) => selectivity *= 0.5,
420            _ => selectivity *= 0.01,
421        }
422
423        selectivity
424    }
425
426    fn can_use_index(&self, pattern: &AlgebraTriplePattern) -> bool {
427        // Can use index if at least one term is bound
428        !matches!(pattern.subject, TermPattern::Variable(_))
429            || !matches!(pattern.predicate, TermPattern::Variable(_))
430            || !matches!(pattern.object, TermPattern::Variable(_))
431    }
432
433    fn estimate_bgp_cardinality(
434        &self,
435        _patterns: &[AlgebraTriplePattern],
436        pattern_costs: &[(usize, PatternCost)],
437    ) -> usize {
438        if pattern_costs.is_empty() {
439            return 0;
440        }
441
442        // Estimate using product of selectivities with correlation factor
443        let mut cardinality = pattern_costs[0].1.estimated_cardinality as f64;
444
445        for (_, cost) in pattern_costs.iter().skip(1) {
446            let join_selectivity = 0.1; // Heuristic for join selectivity
447            cardinality *= join_selectivity * (cost.estimated_cardinality as f64);
448        }
449
450        cardinality.max(1.0) as usize
451    }
452
453    fn should_use_index(&self, patterns: &[AlgebraTriplePattern]) -> bool {
454        // Use index if any pattern has bound terms
455        patterns.iter().any(|p| self.can_use_index(p))
456    }
457
458    fn should_parallelize(&self, _patterns: &[AlgebraTriplePattern], total_cost: f64) -> bool {
459        total_cost > self.cost_config.parallel_threshold
460    }
461
462    fn estimate_join_cost(&self, left: &OptimizedPlan, right: &OptimizedPlan) -> f64 {
463        // Hash join cost: build hash table + probe
464        let build_cost = left.estimated_cardinality as f64 * self.cost_config.hash_build_cost;
465        let probe_cost = right.estimated_cardinality as f64 * self.cost_config.hash_probe_cost;
466
467        build_cost + probe_cost
468    }
469
470    fn estimate_join_cardinality(&self, left: &OptimizedPlan, right: &OptimizedPlan) -> usize {
471        // Use adaptive join selectivity from advanced statistics (learned from execution history)
472        // Falls back to default 0.1 if no historical data available
473        let join_selectivity = self
474            .advanced_stats
475            .estimate_join_selectivity(left.estimated_cardinality, right.estimated_cardinality);
476        let product = left.estimated_cardinality as f64 * right.estimated_cardinality as f64;
477
478        (product * join_selectivity).max(1.0) as usize
479    }
480
481    /// Get optimizer statistics
482    pub fn stats(&self) -> OptimizerStats {
483        OptimizerStats {
484            queries_optimized: self.query_count.load(Ordering::Relaxed),
485            total_triples: self.stats.total_triples(),
486        }
487    }
488
489    /// Get advanced statistics including histogram and join selectivity data
490    ///
491    /// Returns comprehensive statistics about the optimizer's learned knowledge,
492    /// including histogram sizes, join samples, and execution history.
493    pub fn advanced_stats(&self) -> crate::query::advanced_statistics::AdvancedStatistics {
494        self.advanced_stats.get_statistics()
495    }
496
497    /// Get execution history for a specific pattern
498    ///
499    /// Returns all recorded executions for patterns similar to the given pattern.
500    /// Useful for debugging and understanding query performance over time.
501    pub fn get_pattern_history(
502        &self,
503        pattern: &AlgebraTriplePattern,
504    ) -> Vec<crate::query::advanced_statistics::PatternExecution> {
505        self.advanced_stats.get_pattern_history(pattern)
506    }
507
508    /// Clear all accumulated statistics
509    ///
510    /// Resets both legacy and advanced statistics collectors to initial state.
511    /// Useful for testing or when starting fresh after significant data changes.
512    pub fn clear_statistics(&self) {
513        self.advanced_stats.clear();
514        self.query_count.store(0, Ordering::Relaxed);
515    }
516
517    /// Update statistics with actual query results
518    ///
519    /// Feeds actual execution results to both legacy and advanced statistics collectors.
520    /// The advanced collector uses histogram-based tracking (median of 100 observations)
521    /// while the legacy collector uses exponential moving average for backward compatibility.
522    ///
523    /// # Arguments
524    ///
525    /// * `pattern` - The query pattern that was executed
526    /// * `actual_cardinality` - The actual number of results returned
527    pub fn update_stats(&self, pattern: &GraphPattern, actual_cardinality: usize) {
528        self.update_stats_with_time(pattern, actual_cardinality, 0);
529    }
530
531    /// Update statistics with execution time tracking
532    ///
533    /// Extended version of `update_stats` that also records execution time for
534    /// performance profiling and optimization hint generation.
535    pub fn update_stats_with_time(
536        &self,
537        pattern: &GraphPattern,
538        actual_cardinality: usize,
539        execution_time_ms: u64,
540    ) {
541        // Update advanced statistics collector with pattern executions
542        if let GraphPattern::Bgp(patterns) = pattern {
543            for triple_pattern in patterns {
544                self.advanced_stats.record_pattern_execution(
545                    triple_pattern,
546                    actual_cardinality,
547                    execution_time_ms,
548                );
549            }
550        }
551
552        // Update legacy statistics for backward compatibility
553        const ALPHA: f64 = 0.2;
554        let pattern_hash = self.compute_pattern_hash(pattern);
555        let mut pattern_stats = self
556            .stats
557            .pattern_stats
558            .write()
559            .expect("lock should not be poisoned");
560
561        let entry = pattern_stats
562            .entry(pattern_hash)
563            .or_insert_with(|| PatternStats {
564                execution_count: 0,
565                avg_cardinality: actual_cardinality as f64,
566                last_cardinality: actual_cardinality,
567            });
568
569        entry.avg_cardinality =
570            ALPHA * (actual_cardinality as f64) + (1.0 - ALPHA) * entry.avg_cardinality;
571        entry.last_cardinality = actual_cardinality;
572        entry.execution_count += 1;
573
574        // Log significant deviations for performance tuning
575        if entry.execution_count > 5 {
576            let deviation =
577                (actual_cardinality as f64 - entry.avg_cardinality).abs() / entry.avg_cardinality;
578            if deviation > 0.5 {
579                tracing::debug!(
580                    pattern_hash = pattern_hash,
581                    actual = actual_cardinality,
582                    avg = entry.avg_cardinality,
583                    deviation_pct = deviation * 100.0,
584                    "Significant cardinality deviation detected"
585                );
586            }
587        }
588    }
589
590    /// Record join execution for adaptive join selectivity learning
591    ///
592    /// Feeds join execution results to the advanced statistics collector
593    /// to improve join selectivity estimates over time.
594    pub fn record_join_execution(
595        &self,
596        left_pattern: &GraphPattern,
597        right_pattern: &GraphPattern,
598        left_cardinality: usize,
599        right_cardinality: usize,
600        result_cardinality: usize,
601    ) {
602        self.advanced_stats.record_join_execution(
603            left_pattern,
604            right_pattern,
605            left_cardinality,
606            right_cardinality,
607            result_cardinality,
608        );
609    }
610
611    /// Compute a hash for a graph pattern for statistics tracking
612    fn compute_pattern_hash(&self, pattern: &GraphPattern) -> u64 {
613        use std::collections::hash_map::DefaultHasher;
614        use std::hash::{Hash, Hasher};
615
616        let mut hasher = DefaultHasher::new();
617
618        // Hash the pattern structure (simplified version)
619        match pattern {
620            GraphPattern::Bgp(patterns) => {
621                "bgp".hash(&mut hasher);
622                patterns.len().hash(&mut hasher);
623            }
624            GraphPattern::Join(_, _) => {
625                "join".hash(&mut hasher);
626            }
627            GraphPattern::LeftJoin { .. } => {
628                "leftjoin".hash(&mut hasher);
629            }
630            GraphPattern::Union(_, _) => {
631                "union".hash(&mut hasher);
632            }
633            GraphPattern::Filter { .. } => {
634                "filter".hash(&mut hasher);
635            }
636            GraphPattern::Extend { .. } => {
637                "extend".hash(&mut hasher);
638            }
639            GraphPattern::Group { .. } => {
640                "group".hash(&mut hasher);
641            }
642            GraphPattern::Service { .. } => {
643                "service".hash(&mut hasher);
644            }
645            GraphPattern::Minus(_, _) => {
646                "minus".hash(&mut hasher);
647            }
648            GraphPattern::Graph { .. } => {
649                "graph".hash(&mut hasher);
650            }
651            GraphPattern::OrderBy { .. } => {
652                "orderby".hash(&mut hasher);
653            }
654            GraphPattern::Project { .. } => {
655                "project".hash(&mut hasher);
656            }
657            GraphPattern::Distinct(_) => {
658                "distinct".hash(&mut hasher);
659            }
660            GraphPattern::Reduced(_) => {
661                "reduced".hash(&mut hasher);
662            }
663            GraphPattern::Slice { .. } => {
664                "slice".hash(&mut hasher);
665            }
666            GraphPattern::Path { .. } => {
667                "path".hash(&mut hasher);
668            }
669            GraphPattern::Values { .. } => {
670                "values".hash(&mut hasher);
671            }
672        }
673
674        hasher.finish()
675    }
676
677    /// Get learned statistics for a pattern
678    pub fn get_learned_cardinality(&self, pattern: &GraphPattern) -> Option<f64> {
679        let pattern_hash = self.compute_pattern_hash(pattern);
680        let pattern_stats = self
681            .stats
682            .pattern_stats
683            .read()
684            .expect("lock should not be poisoned");
685        pattern_stats.get(&pattern_hash).map(|s| s.avg_cardinality)
686    }
687
688    /// Export optimized plan as QueryPlanNode for visualization
689    ///
690    /// Converts the optimized execution plan into a visual tree structure
691    /// that can be rendered using the QueryPlanVisualizer.
692    ///
693    /// # Example
694    /// ```rust,no_run
695    /// use oxirs_core::query::cost_based_optimizer::CostBasedOptimizer;
696    /// use oxirs_core::query::query_plan_visualizer::QueryPlanVisualizer;
697    /// # use oxirs_core::query::algebra::GraphPattern;
698    ///
699    /// # fn example() -> Result<(), oxirs_core::OxirsError> {
700    /// let optimizer = CostBasedOptimizer::new();
701    /// # let pattern = GraphPattern::Values { variables: vec![], bindings: vec![] };
702    /// let plan = optimizer.optimize_pattern(&pattern)?;
703    /// let visual_plan = optimizer.to_visual_plan(&pattern, &plan);
704    ///
705    /// let visualizer = QueryPlanVisualizer::new();
706    /// println!("{}", visualizer.visualize_as_tree(&visual_plan));
707    /// # Ok(())
708    /// # }
709    /// ```
710    pub fn to_visual_plan(&self, pattern: &GraphPattern, plan: &OptimizedPlan) -> QueryPlanNode {
711        self.pattern_to_visual_node(pattern, Some(plan))
712    }
713
714    /// Recursively convert a GraphPattern into a QueryPlanNode
715    fn pattern_to_visual_node(
716        &self,
717        pattern: &GraphPattern,
718        plan: Option<&OptimizedPlan>,
719    ) -> QueryPlanNode {
720        match pattern {
721            GraphPattern::Bgp(patterns) => {
722                let mut node =
723                    QueryPlanNode::new("BGP", format!("{} triple patterns", patterns.len()))
724                        .with_metadata("pattern_count", patterns.len().to_string());
725
726                if let Some(p) = plan {
727                    node = node
728                        .with_estimated_cardinality(p.estimated_cardinality)
729                        .with_estimated_cost(p.estimated_cost);
730
731                    if p.use_index {
732                        node = node.with_index("SPO/POS/OSP");
733                    }
734                }
735
736                // Add child nodes for each triple pattern in optimal order
737                for (i, idx) in plan
738                    .map(|p| &p.join_order)
739                    .unwrap_or(&vec![])
740                    .iter()
741                    .enumerate()
742                {
743                    if let Some(triple_pattern) = patterns.get(*idx) {
744                        let pattern_desc = format!(
745                            "{} {} {}",
746                            Self::term_pattern_to_string(&triple_pattern.subject),
747                            Self::term_pattern_to_string(&triple_pattern.predicate),
748                            Self::term_pattern_to_string(&triple_pattern.object)
749                        );
750
751                        let cost = self.estimate_pattern_cost(triple_pattern);
752                        let mut child = QueryPlanNode::new("TriplePattern", pattern_desc)
753                            .with_estimated_cardinality(cost.estimated_cardinality)
754                            .with_estimated_cost(cost.total_cost)
755                            .with_metadata("order", (i + 1).to_string());
756
757                        if self.can_use_index(triple_pattern) {
758                            child = child.with_index(self.suggest_index(triple_pattern));
759                        }
760
761                        node.add_child(child);
762                    }
763                }
764
765                node
766            }
767            GraphPattern::Join(left, right) => {
768                let mut node = QueryPlanNode::new("Join", "Hash Join");
769
770                if let Some(p) = plan {
771                    node = node
772                        .with_estimated_cardinality(p.estimated_cardinality)
773                        .with_estimated_cost(p.estimated_cost);
774
775                    if p.parallel_execution {
776                        node = node.with_metadata("execution", "parallel");
777                    }
778                }
779
780                // Add children in optimal order
781                let left_plan = self.optimize_pattern(left).ok();
782                let right_plan = self.optimize_pattern(right).ok();
783
784                let swapped = plan.map(|p| p.join_order == vec![1, 0]).unwrap_or(false);
785
786                if swapped {
787                    node.add_child(self.pattern_to_visual_node(right, right_plan.as_ref()));
788                    node.add_child(self.pattern_to_visual_node(left, left_plan.as_ref()));
789                } else {
790                    node.add_child(self.pattern_to_visual_node(left, left_plan.as_ref()));
791                    node.add_child(self.pattern_to_visual_node(right, right_plan.as_ref()));
792                }
793
794                node
795            }
796            GraphPattern::LeftJoin { left, right, .. } => {
797                let mut node = QueryPlanNode::new("LeftJoin", "Optional Pattern");
798
799                if let Some(p) = plan {
800                    node = node
801                        .with_estimated_cardinality(p.estimated_cardinality)
802                        .with_estimated_cost(p.estimated_cost);
803                }
804
805                let left_plan = self.optimize_pattern(left).ok();
806                let right_plan = self.optimize_pattern(right).ok();
807
808                node.add_child(self.pattern_to_visual_node(left, left_plan.as_ref()));
809                node.add_child(self.pattern_to_visual_node(right, right_plan.as_ref()));
810
811                node
812            }
813            GraphPattern::Filter { expr: _, inner } => {
814                let inner_plan = self.optimize_pattern(inner).ok();
815                let mut node = QueryPlanNode::new("Filter", "Filter expression");
816
817                if let Some(p) = plan {
818                    node = node
819                        .with_estimated_cardinality(p.estimated_cardinality)
820                        .with_estimated_cost(p.estimated_cost)
821                        .with_metadata(
822                            "selectivity",
823                            format!("{:.2}", self.cost_config.filter_selectivity),
824                        );
825                }
826
827                node.add_child(self.pattern_to_visual_node(inner, inner_plan.as_ref()));
828                node
829            }
830            GraphPattern::Union(left, right) => {
831                let mut node = QueryPlanNode::new("Union", "Parallel Union");
832
833                if let Some(p) = plan {
834                    node = node
835                        .with_estimated_cardinality(p.estimated_cardinality)
836                        .with_estimated_cost(p.estimated_cost)
837                        .with_metadata("execution", "parallel");
838                }
839
840                let left_plan = self.optimize_pattern(left).ok();
841                let right_plan = self.optimize_pattern(right).ok();
842
843                node.add_child(self.pattern_to_visual_node(left, left_plan.as_ref()));
844                node.add_child(self.pattern_to_visual_node(right, right_plan.as_ref()));
845
846                node
847            }
848            GraphPattern::Minus(left, right) => {
849                let mut node = QueryPlanNode::new("Minus", "Hash Anti-Join");
850
851                if let Some(p) = plan {
852                    node = node
853                        .with_estimated_cardinality(p.estimated_cardinality)
854                        .with_estimated_cost(p.estimated_cost);
855                }
856
857                let left_plan = self.optimize_pattern(left).ok();
858                let right_plan = self.optimize_pattern(right).ok();
859
860                node.add_child(self.pattern_to_visual_node(left, left_plan.as_ref()));
861                node.add_child(self.pattern_to_visual_node(right, right_plan.as_ref()));
862
863                node
864            }
865            GraphPattern::Extend { inner, .. } => {
866                let inner_plan = self.optimize_pattern(inner).ok();
867                let mut node = QueryPlanNode::new("Extend", "Variable binding");
868
869                if let Some(p) = plan {
870                    node = node.with_estimated_cardinality(p.estimated_cardinality);
871                }
872
873                node.add_child(self.pattern_to_visual_node(inner, inner_plan.as_ref()));
874                node
875            }
876            GraphPattern::Service { .. } => QueryPlanNode::new("Service", "Federated query")
877                .with_metadata("type", "remote")
878                .with_metadata("note", "actual_cardinality_depends_on_remote_endpoint"),
879            GraphPattern::Graph { inner, .. } => {
880                let inner_plan = self.optimize_pattern(inner).ok();
881                let mut node = QueryPlanNode::new("Graph", "Named graph access");
882
883                if let Some(p) = plan {
884                    node = node.with_estimated_cardinality(p.estimated_cardinality);
885                }
886
887                node.add_child(self.pattern_to_visual_node(inner, inner_plan.as_ref()));
888                node
889            }
890            GraphPattern::OrderBy { inner, .. } => {
891                let inner_plan = self.optimize_pattern(inner).ok();
892                let mut node = QueryPlanNode::new("OrderBy", "Sort operation");
893
894                if let Some(p) = plan {
895                    node = node.with_estimated_cardinality(p.estimated_cardinality);
896                }
897
898                node.add_child(self.pattern_to_visual_node(inner, inner_plan.as_ref()));
899                node
900            }
901            GraphPattern::Project { inner, variables } => {
902                let inner_plan = self.optimize_pattern(inner).ok();
903                let mut node =
904                    QueryPlanNode::new("Project", format!("{} variables", variables.len()));
905
906                if let Some(p) = plan {
907                    node = node.with_estimated_cardinality(p.estimated_cardinality);
908                }
909
910                node.add_child(self.pattern_to_visual_node(inner, inner_plan.as_ref()));
911                node
912            }
913            GraphPattern::Distinct(inner) => {
914                let inner_plan = self.optimize_pattern(inner).ok();
915                let mut node = QueryPlanNode::new("Distinct", "Remove duplicates");
916
917                if let Some(p) = plan {
918                    node = node.with_estimated_cardinality(p.estimated_cardinality);
919                }
920
921                node.add_child(self.pattern_to_visual_node(inner, inner_plan.as_ref()));
922                node
923            }
924            GraphPattern::Reduced(inner) => {
925                let inner_plan = self.optimize_pattern(inner).ok();
926                let mut node = QueryPlanNode::new("Reduced", "Best-effort deduplication");
927
928                if let Some(p) = plan {
929                    node = node.with_estimated_cardinality(p.estimated_cardinality);
930                }
931
932                node.add_child(self.pattern_to_visual_node(inner, inner_plan.as_ref()));
933                node
934            }
935            GraphPattern::Slice {
936                inner,
937                offset,
938                limit,
939            } => {
940                let inner_plan = self.optimize_pattern(inner).ok();
941                let limit_str = limit
942                    .map(|l| l.to_string())
943                    .unwrap_or_else(|| "∞".to_string());
944                let mut node =
945                    QueryPlanNode::new("Slice", format!("LIMIT {} OFFSET {}", limit_str, offset));
946
947                if let Some(p) = plan {
948                    let limited_card = if let Some(lim) = limit {
949                        (*lim).min(p.estimated_cardinality.saturating_sub(*offset))
950                    } else {
951                        p.estimated_cardinality.saturating_sub(*offset)
952                    };
953                    node = node.with_estimated_cardinality(limited_card);
954                }
955
956                node.add_child(self.pattern_to_visual_node(inner, inner_plan.as_ref()));
957                node
958            }
959            GraphPattern::Group { inner, .. } => {
960                let inner_plan = self.optimize_pattern(inner).ok();
961                let mut node = QueryPlanNode::new("Group", "GROUP BY aggregation");
962
963                if let Some(p) = plan {
964                    node = node.with_estimated_cardinality(p.estimated_cardinality);
965                }
966
967                node.add_child(self.pattern_to_visual_node(inner, inner_plan.as_ref()));
968                node
969            }
970            GraphPattern::Path { .. } => {
971                QueryPlanNode::new("PropertyPath", "Property path traversal")
972                    .with_metadata("note", "cardinality_depends_on_path_length")
973            }
974            GraphPattern::Values { bindings, .. } => {
975                QueryPlanNode::new("Values", format!("{} bindings", bindings.len()))
976                    .with_estimated_cardinality(bindings.len())
977                    .with_estimated_cost(0.0)
978            }
979        }
980    }
981
982    /// Convert TermPattern to string for display
983    fn term_pattern_to_string(term: &TermPattern) -> String {
984        match term {
985            TermPattern::Variable(v) => format!("?{}", v),
986            TermPattern::NamedNode(n) => {
987                // Shorten URIs for readability
988                let uri = n.as_str();
989                if let Some(local) = uri.rsplit('/').next() {
990                    local.to_string()
991                } else {
992                    uri.to_string()
993                }
994            }
995            TermPattern::BlankNode(b) => format!("_:{}", b),
996            TermPattern::Literal(l) => {
997                let value = l.value();
998                if value.len() > 20 {
999                    format!("\"{}...\"", &value[..17])
1000                } else {
1001                    format!("\"{}\"", value)
1002                }
1003            }
1004            TermPattern::QuotedTriple(t) => format!(
1005                "<<{} {} {}>>",
1006                Self::term_pattern_to_string(&t.subject),
1007                Self::term_pattern_to_string(&t.predicate),
1008                Self::term_pattern_to_string(&t.object)
1009            ),
1010        }
1011    }
1012
1013    /// Suggest best index for a triple pattern
1014    fn suggest_index(&self, pattern: &AlgebraTriplePattern) -> String {
1015        let s_bound = !matches!(pattern.subject, TermPattern::Variable(_));
1016        let p_bound = !matches!(pattern.predicate, TermPattern::Variable(_));
1017        let o_bound = !matches!(pattern.object, TermPattern::Variable(_));
1018
1019        match (s_bound, p_bound, o_bound) {
1020            (true, _, _) => "SPO",
1021            (false, true, _) => "POS",
1022            (false, false, true) => "OSP",
1023            _ => "FullScan",
1024        }
1025        .to_string()
1026    }
1027}
1028
1029impl Default for CostBasedOptimizer {
1030    fn default() -> Self {
1031        Self::new()
1032    }
1033}
1034
1035/// Statistics collector for cost estimation
1036struct StatisticsCollector {
1037    /// Total number of triples in the store
1038    total_triples: AtomicU64,
1039    /// Statistics per predicate (reserved for future use)
1040    #[allow(dead_code)]
1041    predicate_stats: HashMap<String, PredicateStats>,
1042    /// Pattern-specific statistics for adaptive optimization
1043    pattern_stats: Arc<RwLock<HashMap<u64, PatternStats>>>,
1044}
1045
1046impl StatisticsCollector {
1047    fn new() -> Self {
1048        Self {
1049            total_triples: AtomicU64::new(1_000_000), // Default estimate
1050            predicate_stats: HashMap::new(),
1051            pattern_stats: Arc::new(RwLock::new(HashMap::new())),
1052        }
1053    }
1054
1055    fn total_triples(&self) -> usize {
1056        self.total_triples.load(Ordering::Relaxed) as usize
1057    }
1058}
1059
1060/// Statistics tracked for specific query patterns
1061#[derive(Debug, Clone)]
1062struct PatternStats {
1063    /// Number of times this pattern has been executed
1064    execution_count: usize,
1065    /// Average cardinality (exponential moving average)
1066    avg_cardinality: f64,
1067    /// Last observed cardinality
1068    last_cardinality: usize,
1069}
1070
1071/// Statistics for a specific predicate
1072#[derive(Debug, Clone)]
1073#[allow(dead_code)]
1074struct PredicateStats {
1075    /// Number of triples with this predicate
1076    count: usize,
1077    /// Average number of objects per subject
1078    avg_objects_per_subject: f64,
1079    /// Distinct subjects
1080    distinct_subjects: usize,
1081    /// Distinct objects
1082    distinct_objects: usize,
1083}
1084
1085/// Cost configuration parameters
1086#[derive(Debug, Clone)]
1087pub struct CostConfiguration {
1088    /// Cost per sequential scan of one triple
1089    pub sequential_scan_cost: f64,
1090    /// Cost per index access
1091    pub index_access_cost: f64,
1092    /// Cost per tuple processed by CPU
1093    pub cpu_tuple_cost: f64,
1094    /// Cost to build hash table per tuple
1095    pub hash_build_cost: f64,
1096    /// Cost to probe hash table per tuple
1097    pub hash_probe_cost: f64,
1098    /// Default filter selectivity
1099    pub filter_selectivity: f64,
1100    /// Threshold for parallel execution
1101    pub parallel_threshold: f64,
1102}
1103
1104impl Default for CostConfiguration {
1105    fn default() -> Self {
1106        Self {
1107            sequential_scan_cost: 1.0,
1108            index_access_cost: 0.01,
1109            cpu_tuple_cost: 0.001,
1110            hash_build_cost: 0.005,
1111            hash_probe_cost: 0.002,
1112            filter_selectivity: 0.3,
1113            parallel_threshold: 10000.0,
1114        }
1115    }
1116}
1117
1118/// Cost estimate for a single triple pattern
1119#[derive(Debug, Clone)]
1120struct PatternCost {
1121    /// Estimated result cardinality
1122    estimated_cardinality: usize,
1123    /// I/O cost (reserved for detailed cost models)
1124    #[allow(dead_code)]
1125    io_cost: f64,
1126    /// CPU cost (reserved for detailed cost models)
1127    #[allow(dead_code)]
1128    cpu_cost: f64,
1129    /// Total cost
1130    total_cost: f64,
1131}
1132
1133/// Optimized execution plan
1134#[derive(Debug, Clone)]
1135pub struct OptimizedPlan {
1136    /// Optimal join order (indices)
1137    pub join_order: Vec<usize>,
1138    /// Estimated execution cost
1139    pub estimated_cost: f64,
1140    /// Estimated result cardinality
1141    pub estimated_cardinality: usize,
1142    /// Whether to use index access
1143    pub use_index: bool,
1144    /// Whether to use parallel execution
1145    pub parallel_execution: bool,
1146    /// Applied optimizations
1147    pub optimizations: Vec<Optimization>,
1148}
1149
1150impl OptimizedPlan {
1151    /// Create an empty plan
1152    fn empty() -> Self {
1153        Self {
1154            join_order: vec![],
1155            estimated_cost: 0.0,
1156            estimated_cardinality: 0,
1157            use_index: false,
1158            parallel_execution: false,
1159            optimizations: vec![],
1160        }
1161    }
1162}
1163
1164/// Types of optimizations applied
1165#[derive(Debug, Clone, PartialEq, Eq)]
1166pub enum Optimization {
1167    /// Reordered joins for optimal execution
1168    JoinReordering,
1169    /// Used hash join algorithm
1170    HashJoin,
1171    /// Pushed filter down to reduce intermediate results
1172    FilterPushdown,
1173    /// Used index for access
1174    IndexAccess,
1175    /// Used nested loop join with index
1176    IndexNLJ,
1177    /// Parallel union execution
1178    ParallelUnion,
1179    /// Hash-based anti-join for MINUS
1180    HashAntiJoin,
1181    /// Property path evaluation with complexity-based cost estimation
1182    PropertyPathEvaluation,
1183}
1184
1185/// Optimizer statistics
1186#[derive(Debug, Clone)]
1187pub struct OptimizerStats {
1188    /// Number of queries optimized
1189    pub queries_optimized: u64,
1190    /// Total triples in the store
1191    pub total_triples: usize,
1192}
1193
1194#[cfg(test)]
1195mod tests {
1196    use super::*;
1197    use crate::model::{NamedNode, Variable};
1198    use crate::query::algebra::TermPattern;
1199
1200    fn create_test_pattern(
1201        subject: TermPattern,
1202        predicate: TermPattern,
1203        object: TermPattern,
1204    ) -> AlgebraTriplePattern {
1205        AlgebraTriplePattern {
1206            subject,
1207            predicate,
1208            object,
1209        }
1210    }
1211
1212    #[test]
1213    fn test_optimizer_creation() {
1214        let optimizer = CostBasedOptimizer::new();
1215        let stats = optimizer.stats();
1216
1217        assert_eq!(stats.queries_optimized, 0);
1218        assert!(stats.total_triples > 0);
1219    }
1220
1221    #[test]
1222    fn test_selectivity_calculation() {
1223        let optimizer = CostBasedOptimizer::new();
1224
1225        // All variables (least selective)
1226        let pattern1 = create_test_pattern(
1227            TermPattern::Variable(Variable::new("s").expect("valid variable name")),
1228            TermPattern::Variable(Variable::new("p").expect("valid variable name")),
1229            TermPattern::Variable(Variable::new("o").expect("valid variable name")),
1230        );
1231
1232        let cost1 = optimizer.estimate_pattern_cost(&pattern1);
1233        assert!(cost1.estimated_cardinality > 0);
1234
1235        // All constants (most selective)
1236        let pattern2 = create_test_pattern(
1237            TermPattern::NamedNode(NamedNode::new("http://example.org/s").expect("valid IRI")),
1238            TermPattern::NamedNode(NamedNode::new("http://example.org/p").expect("valid IRI")),
1239            TermPattern::NamedNode(NamedNode::new("http://example.org/o").expect("valid IRI")),
1240        );
1241
1242        let cost2 = optimizer.estimate_pattern_cost(&pattern2);
1243
1244        // Constant pattern should be more selective
1245        assert!(cost2.estimated_cardinality < cost1.estimated_cardinality);
1246    }
1247
1248    #[test]
1249    fn test_cost_configuration() {
1250        let config = CostConfiguration::default();
1251
1252        assert!(config.index_access_cost < config.sequential_scan_cost);
1253        assert!(config.cpu_tuple_cost < config.index_access_cost);
1254    }
1255
1256    #[test]
1257    fn test_can_use_index() {
1258        let optimizer = CostBasedOptimizer::new();
1259
1260        // Pattern with bound subject
1261        let pattern1 = create_test_pattern(
1262            TermPattern::NamedNode(NamedNode::new("http://example.org/s").expect("valid IRI")),
1263            TermPattern::Variable(Variable::new("p").expect("valid variable name")),
1264            TermPattern::Variable(Variable::new("o").expect("valid variable name")),
1265        );
1266
1267        assert!(optimizer.can_use_index(&pattern1));
1268
1269        // Pattern with all variables
1270        let pattern2 = create_test_pattern(
1271            TermPattern::Variable(Variable::new("s").expect("valid variable name")),
1272            TermPattern::Variable(Variable::new("p").expect("valid variable name")),
1273            TermPattern::Variable(Variable::new("o").expect("valid variable name")),
1274        );
1275
1276        assert!(!optimizer.can_use_index(&pattern2));
1277    }
1278
1279    #[test]
1280    fn test_empty_plan() {
1281        let plan = OptimizedPlan::empty();
1282
1283        assert_eq!(plan.join_order.len(), 0);
1284        assert_eq!(plan.estimated_cost, 0.0);
1285        assert_eq!(plan.estimated_cardinality, 0);
1286    }
1287}