Skip to main content

grafeo_core/statistics/
collector.rs

1//! Collecting and storing graph statistics.
2//!
3//! Use [`StatisticsCollector`] to stream values through and build statistics,
4//! or construct [`ColumnStatistics`] directly if you already know the numbers.
5//! The [`Statistics`] struct holds everything the optimizer needs.
6
7use super::histogram::Histogram;
8use grafeo_common::types::Value;
9use std::collections::HashMap;
10
11/// A property key identifier.
12pub type PropertyKey = String;
13
14/// Everything the optimizer knows about the data - cardinalities, distributions, degrees.
15///
16/// This is the main struct the query planner consults when choosing between
17/// different execution strategies.
18#[derive(Debug, Clone, Default)]
19pub struct Statistics {
20    /// Per-label statistics.
21    pub labels: HashMap<String, LabelStatistics>,
22    /// Per-edge-type statistics.
23    pub edge_types: HashMap<String, EdgeTypeStatistics>,
24    /// Per-property statistics (across all labels).
25    pub properties: HashMap<PropertyKey, ColumnStatistics>,
26    /// Total node count.
27    pub total_nodes: u64,
28    /// Total edge count.
29    pub total_edges: u64,
30}
31
32impl Statistics {
33    /// Creates a new empty statistics object.
34    pub fn new() -> Self {
35        Self::default()
36    }
37
38    /// Updates label statistics.
39    pub fn update_label(&mut self, label: &str, stats: LabelStatistics) {
40        self.labels.insert(label.to_string(), stats);
41    }
42
43    /// Updates edge type statistics.
44    pub fn update_edge_type(&mut self, edge_type: &str, stats: EdgeTypeStatistics) {
45        self.edge_types.insert(edge_type.to_string(), stats);
46    }
47
48    /// Updates property statistics.
49    pub fn update_property(&mut self, property: &str, stats: ColumnStatistics) {
50        self.properties.insert(property.to_string(), stats);
51    }
52
53    /// Gets label statistics.
54    pub fn get_label(&self, label: &str) -> Option<&LabelStatistics> {
55        self.labels.get(label)
56    }
57
58    /// Gets edge type statistics.
59    pub fn get_edge_type(&self, edge_type: &str) -> Option<&EdgeTypeStatistics> {
60        self.edge_types.get(edge_type)
61    }
62
63    /// Gets property statistics.
64    pub fn get_property(&self, property: &str) -> Option<&ColumnStatistics> {
65        self.properties.get(property)
66    }
67
68    /// Estimates the cardinality of a label scan.
69    pub fn estimate_label_cardinality(&self, label: &str) -> f64 {
70        self.labels
71            .get(label)
72            .map(|s| s.node_count as f64)
73            .unwrap_or(1000.0) // Default estimate if no statistics
74    }
75
76    /// Estimates the average degree for an edge type.
77    pub fn estimate_avg_degree(&self, edge_type: &str, outgoing: bool) -> f64 {
78        self.edge_types
79            .get(edge_type)
80            .map(|s| {
81                if outgoing {
82                    s.avg_out_degree
83                } else {
84                    s.avg_in_degree
85                }
86            })
87            .unwrap_or(10.0) // Default estimate
88    }
89
90    /// Estimates selectivity of an equality predicate.
91    pub fn estimate_equality_selectivity(&self, property: &str, _value: &Value) -> f64 {
92        self.properties
93            .get(property)
94            .map(|s| {
95                if s.distinct_count > 0 {
96                    1.0 / s.distinct_count as f64
97                } else {
98                    0.5
99                }
100            })
101            .unwrap_or(0.5)
102    }
103
104    /// Estimates selectivity of a range predicate.
105    pub fn estimate_range_selectivity(
106        &self,
107        property: &str,
108        lower: Option<&Value>,
109        upper: Option<&Value>,
110    ) -> f64 {
111        self.properties
112            .get(property)
113            .and_then(|s| s.histogram.as_ref())
114            .map(|h| h.estimate_range_selectivity(lower, upper, true, true))
115            .unwrap_or(0.33) // Default for range predicates
116    }
117}
118
119/// Statistics for nodes with a particular label (like "Person" or "Company").
120#[derive(Debug, Clone)]
121pub struct LabelStatistics {
122    /// Number of nodes with this label.
123    pub node_count: u64,
124    /// Average outgoing degree.
125    pub avg_out_degree: f64,
126    /// Average incoming degree.
127    pub avg_in_degree: f64,
128    /// Per-property statistics for nodes with this label.
129    pub properties: HashMap<PropertyKey, ColumnStatistics>,
130}
131
132impl LabelStatistics {
133    /// Creates new label statistics.
134    pub fn new(node_count: u64) -> Self {
135        Self {
136            node_count,
137            avg_out_degree: 0.0,
138            avg_in_degree: 0.0,
139            properties: HashMap::new(),
140        }
141    }
142
143    /// Sets the average degrees.
144    pub fn with_degrees(mut self, out_degree: f64, in_degree: f64) -> Self {
145        self.avg_out_degree = out_degree;
146        self.avg_in_degree = in_degree;
147        self
148    }
149
150    /// Adds property statistics.
151    pub fn with_property(mut self, property: &str, stats: ColumnStatistics) -> Self {
152        self.properties.insert(property.to_string(), stats);
153        self
154    }
155}
156
157/// Alias for table statistics (used in relational contexts).
158pub type TableStatistics = LabelStatistics;
159
160/// Statistics for edges of a particular type (like "KNOWS" or "WORKS_AT").
161#[derive(Debug, Clone)]
162pub struct EdgeTypeStatistics {
163    /// Number of edges of this type.
164    pub edge_count: u64,
165    /// Average outgoing degree (edges per source node).
166    pub avg_out_degree: f64,
167    /// Average incoming degree (edges per target node).
168    pub avg_in_degree: f64,
169    /// Per-property statistics for edges of this type.
170    pub properties: HashMap<PropertyKey, ColumnStatistics>,
171}
172
173impl EdgeTypeStatistics {
174    /// Creates new edge type statistics.
175    pub fn new(edge_count: u64, avg_out_degree: f64, avg_in_degree: f64) -> Self {
176        Self {
177            edge_count,
178            avg_out_degree,
179            avg_in_degree,
180            properties: HashMap::new(),
181        }
182    }
183
184    /// Adds property statistics.
185    pub fn with_property(mut self, property: &str, stats: ColumnStatistics) -> Self {
186        self.properties.insert(property.to_string(), stats);
187        self
188    }
189}
190
191/// Detailed statistics about a property's values - min, max, histogram, null ratio.
192#[derive(Debug, Clone)]
193pub struct ColumnStatistics {
194    /// Number of distinct values.
195    pub distinct_count: u64,
196    /// Total number of values (including nulls).
197    pub total_count: u64,
198    /// Number of null values.
199    pub null_count: u64,
200    /// Minimum value (if applicable).
201    pub min_value: Option<Value>,
202    /// Maximum value (if applicable).
203    pub max_value: Option<Value>,
204    /// Average value (for numeric types).
205    pub avg_value: Option<f64>,
206    /// Equi-depth histogram (for selectivity estimation).
207    pub histogram: Option<Histogram>,
208    /// Most common values with their frequencies.
209    pub most_common: Vec<(Value, f64)>,
210}
211
212impl ColumnStatistics {
213    /// Creates new column statistics with basic info.
214    pub fn new(distinct_count: u64, total_count: u64, null_count: u64) -> Self {
215        Self {
216            distinct_count,
217            total_count,
218            null_count,
219            min_value: None,
220            max_value: None,
221            avg_value: None,
222            histogram: None,
223            most_common: Vec::new(),
224        }
225    }
226
227    /// Sets min/max values.
228    pub fn with_min_max(mut self, min: Value, max: Value) -> Self {
229        self.min_value = Some(min);
230        self.max_value = Some(max);
231        self
232    }
233
234    /// Sets the average value.
235    pub fn with_avg(mut self, avg: f64) -> Self {
236        self.avg_value = Some(avg);
237        self
238    }
239
240    /// Sets the histogram.
241    pub fn with_histogram(mut self, histogram: Histogram) -> Self {
242        self.histogram = Some(histogram);
243        self
244    }
245
246    /// Sets the most common values.
247    pub fn with_most_common(mut self, values: Vec<(Value, f64)>) -> Self {
248        self.most_common = values;
249        self
250    }
251
252    /// Returns the null fraction.
253    pub fn null_fraction(&self) -> f64 {
254        if self.total_count == 0 {
255            0.0
256        } else {
257            self.null_count as f64 / self.total_count as f64
258        }
259    }
260
261    /// Estimates selectivity for an equality predicate.
262    pub fn estimate_equality_selectivity(&self, value: &Value) -> f64 {
263        // Check most common values first
264        for (mcv, freq) in &self.most_common {
265            if mcv == value {
266                return *freq;
267            }
268        }
269
270        // Use histogram if available
271        if let Some(ref hist) = self.histogram {
272            return hist.estimate_equality_selectivity(value);
273        }
274
275        // Fall back to uniform distribution assumption
276        if self.distinct_count > 0 {
277            1.0 / self.distinct_count as f64
278        } else {
279            0.0
280        }
281    }
282
283    /// Estimates selectivity for a range predicate.
284    pub fn estimate_range_selectivity(&self, lower: Option<&Value>, upper: Option<&Value>) -> f64 {
285        if let Some(ref hist) = self.histogram {
286            return hist.estimate_range_selectivity(lower, upper, true, true);
287        }
288
289        // Without histogram, use min/max if available
290        match (&self.min_value, &self.max_value, lower, upper) {
291            (Some(min), Some(max), Some(l), Some(u)) => {
292                // Linear interpolation
293                estimate_linear_range(min, max, l, u)
294            }
295            (Some(_), Some(_), Some(_), None) => 0.5, // Greater than
296            (Some(_), Some(_), None, Some(_)) => 0.5, // Less than
297            _ => 0.33,                                // Default
298        }
299    }
300}
301
302/// Estimates range selectivity using linear interpolation.
303fn estimate_linear_range(min: &Value, max: &Value, lower: &Value, upper: &Value) -> f64 {
304    match (min, max, lower, upper) {
305        (
306            Value::Int64(min_v),
307            Value::Int64(max_v),
308            Value::Int64(lower_v),
309            Value::Int64(upper_v),
310        ) => {
311            let total_range = (max_v - min_v) as f64;
312            if total_range <= 0.0 {
313                return 1.0;
314            }
315
316            let effective_lower = (*lower_v).max(*min_v);
317            let effective_upper = (*upper_v).min(*max_v);
318
319            if effective_upper < effective_lower {
320                return 0.0;
321            }
322
323            (effective_upper - effective_lower) as f64 / total_range
324        }
325        (
326            Value::Float64(min_v),
327            Value::Float64(max_v),
328            Value::Float64(lower_v),
329            Value::Float64(upper_v),
330        ) => {
331            let total_range = max_v - min_v;
332            if total_range <= 0.0 {
333                return 1.0;
334            }
335
336            let effective_lower = lower_v.max(*min_v);
337            let effective_upper = upper_v.min(*max_v);
338
339            if effective_upper < effective_lower {
340                return 0.0;
341            }
342
343            (effective_upper - effective_lower) / total_range
344        }
345        _ => 0.33,
346    }
347}
348
349/// Streams values through to build statistics automatically.
350///
351/// Call [`add()`](Self::add) for each value, then [`build()`](Self::build)
352/// to get the final [`ColumnStatistics`] with histogram and most common values.
353#[allow(dead_code)]
354pub struct StatisticsCollector {
355    /// Values collected for histogram building.
356    values: Vec<Value>,
357    /// Distinct value tracker.
358    distinct: std::collections::HashSet<String>,
359    /// Running min.
360    min: Option<Value>,
361    /// Running max.
362    max: Option<Value>,
363    /// Running sum (for numeric).
364    sum: f64,
365    /// Null count.
366    null_count: u64,
367    /// Value frequency counter.
368    frequencies: HashMap<String, u64>,
369}
370
371#[allow(dead_code)]
372impl StatisticsCollector {
373    /// Creates a new statistics collector.
374    pub fn new() -> Self {
375        Self {
376            values: Vec::new(),
377            distinct: std::collections::HashSet::new(),
378            min: None,
379            max: None,
380            sum: 0.0,
381            null_count: 0,
382            frequencies: HashMap::new(),
383        }
384    }
385
386    /// Adds a value to the collector.
387    pub fn add(&mut self, value: Value) {
388        if matches!(value, Value::Null) {
389            self.null_count += 1;
390            return;
391        }
392
393        // Track distinct values
394        let key = format!("{value:?}");
395        self.distinct.insert(key.clone());
396
397        // Track frequencies
398        *self.frequencies.entry(key).or_insert(0) += 1;
399
400        // Track min/max
401        self.update_min_max(&value);
402
403        // Track sum for numeric
404        if let Some(v) = value_to_f64(&value) {
405            self.sum += v;
406        }
407
408        self.values.push(value);
409    }
410
411    fn update_min_max(&mut self, value: &Value) {
412        // Update min
413        match &self.min {
414            None => self.min = Some(value.clone()),
415            Some(current) => {
416                if compare_values(value, current) == Some(std::cmp::Ordering::Less) {
417                    self.min = Some(value.clone());
418                }
419            }
420        }
421
422        // Update max
423        match &self.max {
424            None => self.max = Some(value.clone()),
425            Some(current) => {
426                if compare_values(value, current) == Some(std::cmp::Ordering::Greater) {
427                    self.max = Some(value.clone());
428                }
429            }
430        }
431    }
432
433    /// Builds column statistics from collected data.
434    pub fn build(mut self, num_histogram_buckets: usize, num_mcv: usize) -> ColumnStatistics {
435        let total_count = self.values.len() as u64 + self.null_count;
436        let distinct_count = self.distinct.len() as u64;
437
438        let avg = if !self.values.is_empty() {
439            Some(self.sum / self.values.len() as f64)
440        } else {
441            None
442        };
443
444        // Build histogram
445        self.values
446            .sort_by(|a, b| compare_values(a, b).unwrap_or(std::cmp::Ordering::Equal));
447        let histogram = if self.values.len() >= num_histogram_buckets {
448            Some(Histogram::build(&self.values, num_histogram_buckets))
449        } else {
450            None
451        };
452
453        // Find most common values
454        let total_non_null = self.values.len() as f64;
455        let mut freq_vec: Vec<_> = self.frequencies.into_iter().collect();
456        freq_vec.sort_by(|a, b| b.1.cmp(&a.1));
457
458        let most_common: Vec<(Value, f64)> = freq_vec
459            .into_iter()
460            .take(num_mcv)
461            .filter_map(|(key, count)| {
462                // Try to parse the key back to a value (simplified)
463                let freq = count as f64 / total_non_null;
464                // This is a simplification - we'd need to store actual values
465                if key.starts_with("Int64(") {
466                    let num_str = key.trim_start_matches("Int64(").trim_end_matches(')');
467                    num_str.parse::<i64>().ok().map(|n| (Value::Int64(n), freq))
468                } else if key.starts_with("String(") {
469                    let s = key
470                        .trim_start_matches("String(Arc(\"")
471                        .trim_end_matches("\"))");
472                    Some((Value::String(s.to_string().into()), freq))
473                } else {
474                    None
475                }
476            })
477            .collect();
478
479        let mut stats = ColumnStatistics::new(distinct_count, total_count, self.null_count);
480
481        if let Some(min) = self.min {
482            if let Some(max) = self.max {
483                stats = stats.with_min_max(min, max);
484            }
485        }
486
487        if let Some(avg) = avg {
488            stats = stats.with_avg(avg);
489        }
490
491        if let Some(hist) = histogram {
492            stats = stats.with_histogram(hist);
493        }
494
495        if !most_common.is_empty() {
496            stats = stats.with_most_common(most_common);
497        }
498
499        stats
500    }
501}
502
503impl Default for StatisticsCollector {
504    fn default() -> Self {
505        Self::new()
506    }
507}
508
509/// Converts a value to f64.
510#[allow(dead_code)]
511fn value_to_f64(value: &Value) -> Option<f64> {
512    match value {
513        Value::Int64(i) => Some(*i as f64),
514        Value::Float64(f) => Some(*f),
515        _ => None,
516    }
517}
518
519/// Compares two values.
520#[allow(dead_code)]
521fn compare_values(a: &Value, b: &Value) -> Option<std::cmp::Ordering> {
522    match (a, b) {
523        (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
524        (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
525        (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
526        (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
527        (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
528        (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
529        _ => None,
530    }
531}
532
533#[cfg(test)]
534mod tests {
535    use super::*;
536
537    #[test]
538    fn test_statistics_collector() {
539        let mut collector = StatisticsCollector::new();
540
541        for i in 0..100 {
542            collector.add(Value::Int64(i % 10)); // Values 0-9, each appearing 10 times
543        }
544
545        let stats = collector.build(10, 5);
546
547        assert_eq!(stats.distinct_count, 10);
548        assert_eq!(stats.total_count, 100);
549        assert_eq!(stats.null_count, 0);
550        assert_eq!(stats.min_value, Some(Value::Int64(0)));
551        assert_eq!(stats.max_value, Some(Value::Int64(9)));
552    }
553
554    #[test]
555    fn test_statistics_with_nulls() {
556        let mut collector = StatisticsCollector::new();
557
558        collector.add(Value::Int64(1));
559        collector.add(Value::Null);
560        collector.add(Value::Int64(2));
561        collector.add(Value::Null);
562        collector.add(Value::Int64(3));
563
564        let stats = collector.build(5, 3);
565
566        assert_eq!(stats.total_count, 5);
567        assert_eq!(stats.null_count, 2);
568        assert_eq!(stats.distinct_count, 3);
569        assert!((stats.null_fraction() - 0.4).abs() < 0.01);
570    }
571
572    #[test]
573    fn test_label_statistics() {
574        let stats = LabelStatistics::new(1000)
575            .with_degrees(5.0, 3.0)
576            .with_property(
577                "age",
578                ColumnStatistics::new(50, 1000, 10)
579                    .with_min_max(Value::Int64(0), Value::Int64(100)),
580            );
581
582        assert_eq!(stats.node_count, 1000);
583        assert_eq!(stats.avg_out_degree, 5.0);
584        assert!(stats.properties.contains_key("age"));
585    }
586
587    #[test]
588    fn test_database_statistics() {
589        let mut db_stats = Statistics::new();
590
591        db_stats.update_label(
592            "Person",
593            LabelStatistics::new(10000).with_degrees(10.0, 10.0),
594        );
595
596        db_stats.update_edge_type("KNOWS", EdgeTypeStatistics::new(50000, 5.0, 5.0));
597
598        assert_eq!(db_stats.estimate_label_cardinality("Person"), 10000.0);
599        assert_eq!(db_stats.estimate_label_cardinality("Unknown"), 1000.0); // Default
600
601        assert_eq!(db_stats.estimate_avg_degree("KNOWS", true), 5.0);
602    }
603}