ruvector_graph/executor/
stats.rs

1//! Statistics collection for cost-based query optimization
2//!
3//! Maintains table and column statistics for query planning
4
5use std::collections::HashMap;
6use std::sync::RwLock;
7
8/// Statistics manager for query optimization
9pub struct Statistics {
10    /// Table-level statistics
11    tables: RwLock<HashMap<String, TableStats>>,
12    /// Column-level statistics
13    columns: RwLock<HashMap<String, ColumnStats>>,
14}
15
16impl Statistics {
17    /// Create a new statistics manager
18    pub fn new() -> Self {
19        Self {
20            tables: RwLock::new(HashMap::new()),
21            columns: RwLock::new(HashMap::new()),
22        }
23    }
24
25    /// Update table statistics
26    pub fn update_table_stats(&self, table_name: String, stats: TableStats) {
27        self.tables.write().unwrap().insert(table_name, stats);
28    }
29
30    /// Get table statistics
31    pub fn get_table_stats(&self, table_name: &str) -> Option<TableStats> {
32        self.tables.read().unwrap().get(table_name).cloned()
33    }
34
35    /// Update column statistics
36    pub fn update_column_stats(&self, column_key: String, stats: ColumnStats) {
37        self.columns.write().unwrap().insert(column_key, stats);
38    }
39
40    /// Get column statistics
41    pub fn get_column_stats(&self, column_key: &str) -> Option<ColumnStats> {
42        self.columns.read().unwrap().get(column_key).cloned()
43    }
44
45    /// Check if statistics are empty
46    pub fn is_empty(&self) -> bool {
47        self.tables.read().unwrap().is_empty() && self.columns.read().unwrap().is_empty()
48    }
49
50    /// Clear all statistics
51    pub fn clear(&self) {
52        self.tables.write().unwrap().clear();
53        self.columns.write().unwrap().clear();
54    }
55
56    /// Estimate join selectivity
57    pub fn estimate_join_selectivity(
58        &self,
59        left_table: &str,
60        right_table: &str,
61        join_column: &str,
62    ) -> f64 {
63        let left_stats = self.get_table_stats(left_table);
64        let right_stats = self.get_table_stats(right_table);
65
66        if let (Some(left), Some(right)) = (left_stats, right_stats) {
67            // Simple selectivity estimate based on cardinalities
68            let left_ndv = left.row_count as f64;
69            let right_ndv = right.row_count as f64;
70
71            if left_ndv > 0.0 && right_ndv > 0.0 {
72                1.0 / left_ndv.max(right_ndv)
73            } else {
74                0.1 // Default selectivity
75            }
76        } else {
77            0.1 // Default selectivity when stats not available
78        }
79    }
80
81    /// Estimate filter selectivity
82    pub fn estimate_filter_selectivity(&self, column_key: &str, operator: &str) -> f64 {
83        if let Some(stats) = self.get_column_stats(column_key) {
84            match operator {
85                "=" => 1.0 / stats.ndv.max(1) as f64,
86                ">" | "<" => 0.33,
87                ">=" | "<=" => 0.33,
88                "!=" => 1.0 - (1.0 / stats.ndv.max(1) as f64),
89                "LIKE" => 0.1,
90                "IN" => 0.2,
91                _ => 0.1,
92            }
93        } else {
94            0.1 // Default selectivity
95        }
96    }
97}
98
99impl Default for Statistics {
100    fn default() -> Self {
101        Self::new()
102    }
103}
104
105/// Table-level statistics
106#[derive(Debug, Clone)]
107pub struct TableStats {
108    /// Total number of rows
109    pub row_count: usize,
110    /// Average row size in bytes
111    pub avg_row_size: usize,
112    /// Total table size in bytes
113    pub total_size: usize,
114    /// Number of distinct values (for single-column tables)
115    pub ndv: usize,
116    /// Last update timestamp
117    pub last_updated: std::time::SystemTime,
118}
119
120impl TableStats {
121    /// Create new table statistics
122    pub fn new(row_count: usize, avg_row_size: usize) -> Self {
123        Self {
124            row_count,
125            avg_row_size,
126            total_size: row_count * avg_row_size,
127            ndv: row_count, // Conservative estimate
128            last_updated: std::time::SystemTime::now(),
129        }
130    }
131
132    /// Update row count
133    pub fn update_row_count(&mut self, row_count: usize) {
134        self.row_count = row_count;
135        self.total_size = row_count * self.avg_row_size;
136        self.last_updated = std::time::SystemTime::now();
137    }
138
139    /// Estimate scan cost (relative units)
140    pub fn estimate_scan_cost(&self) -> f64 {
141        self.row_count as f64 * 0.001 // Simplified cost model
142    }
143}
144
145/// Column-level statistics
146#[derive(Debug, Clone)]
147pub struct ColumnStats {
148    /// Number of distinct values
149    pub ndv: usize,
150    /// Number of null values
151    pub null_count: usize,
152    /// Minimum value (for ordered types)
153    pub min_value: Option<ColumnValue>,
154    /// Maximum value (for ordered types)
155    pub max_value: Option<ColumnValue>,
156    /// Histogram for distribution
157    pub histogram: Option<Histogram>,
158    /// Most common values and their frequencies
159    pub mcv: Vec<(ColumnValue, usize)>,
160}
161
162impl ColumnStats {
163    /// Create new column statistics
164    pub fn new(ndv: usize, null_count: usize) -> Self {
165        Self {
166            ndv,
167            null_count,
168            min_value: None,
169            max_value: None,
170            histogram: None,
171            mcv: Vec::new(),
172        }
173    }
174
175    /// Set min/max values
176    pub fn with_range(mut self, min: ColumnValue, max: ColumnValue) -> Self {
177        self.min_value = Some(min);
178        self.max_value = Some(max);
179        self
180    }
181
182    /// Set histogram
183    pub fn with_histogram(mut self, histogram: Histogram) -> Self {
184        self.histogram = Some(histogram);
185        self
186    }
187
188    /// Set most common values
189    pub fn with_mcv(mut self, mcv: Vec<(ColumnValue, usize)>) -> Self {
190        self.mcv = mcv;
191        self
192    }
193
194    /// Estimate selectivity for equality predicate
195    pub fn estimate_equality_selectivity(&self, value: &ColumnValue) -> f64 {
196        // Check if value is in MCV
197        for (mcv_val, freq) in &self.mcv {
198            if mcv_val == value {
199                return *freq as f64 / self.ndv as f64;
200            }
201        }
202
203        // Default: uniform distribution assumption
204        if self.ndv > 0 {
205            1.0 / self.ndv as f64
206        } else {
207            0.0
208        }
209    }
210
211    /// Estimate selectivity for range predicate
212    pub fn estimate_range_selectivity(&self, start: &ColumnValue, end: &ColumnValue) -> f64 {
213        if let Some(histogram) = &self.histogram {
214            histogram.estimate_range_selectivity(start, end)
215        } else {
216            0.33 // Default for range queries
217        }
218    }
219}
220
221/// Column value for statistics
222#[derive(Debug, Clone, PartialEq, PartialOrd)]
223pub enum ColumnValue {
224    Int64(i64),
225    Float64(f64),
226    String(String),
227    Boolean(bool),
228}
229
230/// Histogram for data distribution
231#[derive(Debug, Clone)]
232pub struct Histogram {
233    /// Histogram buckets
234    pub buckets: Vec<HistogramBucket>,
235    /// Total number of values
236    pub total_count: usize,
237}
238
239impl Histogram {
240    /// Create new histogram
241    pub fn new(buckets: Vec<HistogramBucket>, total_count: usize) -> Self {
242        Self {
243            buckets,
244            total_count,
245        }
246    }
247
248    /// Create equi-width histogram
249    pub fn equi_width(min: f64, max: f64, num_buckets: usize, values: &[f64]) -> Self {
250        let width = (max - min) / num_buckets as f64;
251        let mut buckets = Vec::with_capacity(num_buckets);
252
253        for i in 0..num_buckets {
254            let lower = min + i as f64 * width;
255            let upper = if i == num_buckets - 1 {
256                max
257            } else {
258                min + (i + 1) as f64 * width
259            };
260
261            let count = values.iter().filter(|&&v| v >= lower && v < upper).count();
262            // Estimate NDV by counting unique values (using BTreeSet to avoid Hash requirement)
263            let ndv = values
264                .iter()
265                .filter(|&&v| v >= lower && v < upper)
266                .map(|&v| ordered_float::OrderedFloat(v))
267                .collect::<std::collections::BTreeSet<_>>()
268                .len();
269
270            buckets.push(HistogramBucket {
271                lower_bound: ColumnValue::Float64(lower),
272                upper_bound: ColumnValue::Float64(upper),
273                count,
274                ndv,
275            });
276        }
277
278        Self {
279            buckets,
280            total_count: values.len(),
281        }
282    }
283
284    /// Estimate selectivity for range query
285    pub fn estimate_range_selectivity(&self, start: &ColumnValue, end: &ColumnValue) -> f64 {
286        if self.total_count == 0 {
287            return 0.0;
288        }
289
290        let mut matching_count = 0;
291        for bucket in &self.buckets {
292            if bucket.overlaps(start, end) {
293                matching_count += bucket.count;
294            }
295        }
296
297        matching_count as f64 / self.total_count as f64
298    }
299
300    /// Get number of buckets
301    pub fn num_buckets(&self) -> usize {
302        self.buckets.len()
303    }
304}
305
306/// Histogram bucket
307#[derive(Debug, Clone)]
308pub struct HistogramBucket {
309    /// Lower bound (inclusive)
310    pub lower_bound: ColumnValue,
311    /// Upper bound (exclusive, except for last bucket)
312    pub upper_bound: ColumnValue,
313    /// Number of values in bucket
314    pub count: usize,
315    /// Number of distinct values in bucket
316    pub ndv: usize,
317}
318
319impl HistogramBucket {
320    /// Check if bucket overlaps with range
321    pub fn overlaps(&self, start: &ColumnValue, end: &ColumnValue) -> bool {
322        // Simplified overlap check
323        self.lower_bound <= *end && self.upper_bound >= *start
324    }
325
326    /// Get bucket width (for numeric types)
327    pub fn width(&self) -> Option<f64> {
328        match (&self.lower_bound, &self.upper_bound) {
329            (ColumnValue::Float64(lower), ColumnValue::Float64(upper)) => Some(upper - lower),
330            (ColumnValue::Int64(lower), ColumnValue::Int64(upper)) => Some((upper - lower) as f64),
331            _ => None,
332        }
333    }
334}
335
336#[cfg(test)]
337mod tests {
338    use super::*;
339
340    #[test]
341    fn test_statistics_creation() {
342        let stats = Statistics::new();
343        assert!(stats.is_empty());
344    }
345
346    #[test]
347    fn test_table_stats() {
348        let stats = Statistics::new();
349        let table_stats = TableStats::new(1000, 128);
350
351        stats.update_table_stats("nodes".to_string(), table_stats.clone());
352
353        let retrieved = stats.get_table_stats("nodes");
354        assert!(retrieved.is_some());
355        assert_eq!(retrieved.unwrap().row_count, 1000);
356    }
357
358    #[test]
359    fn test_column_stats() {
360        let stats = Statistics::new();
361        let col_stats = ColumnStats::new(500, 10);
362
363        stats.update_column_stats("nodes.id".to_string(), col_stats);
364
365        let retrieved = stats.get_column_stats("nodes.id");
366        assert!(retrieved.is_some());
367        assert_eq!(retrieved.unwrap().ndv, 500);
368    }
369
370    #[test]
371    fn test_histogram_creation() {
372        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
373        let histogram = Histogram::equi_width(1.0, 10.0, 5, &values);
374
375        assert_eq!(histogram.num_buckets(), 5);
376        assert_eq!(histogram.total_count, 10);
377    }
378
379    #[test]
380    fn test_selectivity_estimation() {
381        let stats = Statistics::new();
382        let table_stats = TableStats::new(1000, 128);
383
384        stats.update_table_stats("nodes".to_string(), table_stats);
385
386        let selectivity = stats.estimate_join_selectivity("nodes", "edges", "id");
387        assert!(selectivity > 0.0 && selectivity <= 1.0);
388    }
389
390    #[test]
391    fn test_filter_selectivity() {
392        let stats = Statistics::new();
393        let col_stats = ColumnStats::new(100, 5);
394
395        stats.update_column_stats("nodes.age".to_string(), col_stats);
396
397        let selectivity = stats.estimate_filter_selectivity("nodes.age", "=");
398        assert_eq!(selectivity, 0.01); // 1/100
399    }
400}