Skip to main content

grafeo_core/statistics/
collector.rs

1//! Statistics collection and storage.
2//!
3//! This module provides structures for collecting and storing
4//! statistics about graph data for query optimization.
5
6use super::histogram::Histogram;
7use grafeo_common::types::Value;
8use std::collections::HashMap;
9
10/// A property key identifier.
11pub type PropertyKey = String;
12
13/// Statistics for the entire database.
14#[derive(Debug, Clone, Default)]
15pub struct Statistics {
16    /// Per-label statistics.
17    pub labels: HashMap<String, LabelStatistics>,
18    /// Per-edge-type statistics.
19    pub edge_types: HashMap<String, EdgeTypeStatistics>,
20    /// Per-property statistics (across all labels).
21    pub properties: HashMap<PropertyKey, ColumnStatistics>,
22    /// Total node count.
23    pub total_nodes: u64,
24    /// Total edge count.
25    pub total_edges: u64,
26}
27
28impl Statistics {
29    /// Creates a new empty statistics object.
30    pub fn new() -> Self {
31        Self::default()
32    }
33
34    /// Updates label statistics.
35    pub fn update_label(&mut self, label: &str, stats: LabelStatistics) {
36        self.labels.insert(label.to_string(), stats);
37    }
38
39    /// Updates edge type statistics.
40    pub fn update_edge_type(&mut self, edge_type: &str, stats: EdgeTypeStatistics) {
41        self.edge_types.insert(edge_type.to_string(), stats);
42    }
43
44    /// Updates property statistics.
45    pub fn update_property(&mut self, property: &str, stats: ColumnStatistics) {
46        self.properties.insert(property.to_string(), stats);
47    }
48
49    /// Gets label statistics.
50    pub fn get_label(&self, label: &str) -> Option<&LabelStatistics> {
51        self.labels.get(label)
52    }
53
54    /// Gets edge type statistics.
55    pub fn get_edge_type(&self, edge_type: &str) -> Option<&EdgeTypeStatistics> {
56        self.edge_types.get(edge_type)
57    }
58
59    /// Gets property statistics.
60    pub fn get_property(&self, property: &str) -> Option<&ColumnStatistics> {
61        self.properties.get(property)
62    }
63
64    /// Estimates the cardinality of a label scan.
65    pub fn estimate_label_cardinality(&self, label: &str) -> f64 {
66        self.labels
67            .get(label)
68            .map(|s| s.node_count as f64)
69            .unwrap_or(1000.0) // Default estimate if no statistics
70    }
71
72    /// Estimates the average degree for an edge type.
73    pub fn estimate_avg_degree(&self, edge_type: &str, outgoing: bool) -> f64 {
74        self.edge_types
75            .get(edge_type)
76            .map(|s| {
77                if outgoing {
78                    s.avg_out_degree
79                } else {
80                    s.avg_in_degree
81                }
82            })
83            .unwrap_or(10.0) // Default estimate
84    }
85
86    /// Estimates selectivity of an equality predicate.
87    pub fn estimate_equality_selectivity(&self, property: &str, _value: &Value) -> f64 {
88        self.properties
89            .get(property)
90            .map(|s| {
91                if s.distinct_count > 0 {
92                    1.0 / s.distinct_count as f64
93                } else {
94                    0.5
95                }
96            })
97            .unwrap_or(0.5)
98    }
99
100    /// Estimates selectivity of a range predicate.
101    pub fn estimate_range_selectivity(
102        &self,
103        property: &str,
104        lower: Option<&Value>,
105        upper: Option<&Value>,
106    ) -> f64 {
107        self.properties
108            .get(property)
109            .and_then(|s| s.histogram.as_ref())
110            .map(|h| h.estimate_range_selectivity(lower, upper, true, true))
111            .unwrap_or(0.33) // Default for range predicates
112    }
113}
114
115/// Statistics for a single label (node type).
116#[derive(Debug, Clone)]
117pub struct LabelStatistics {
118    /// Number of nodes with this label.
119    pub node_count: u64,
120    /// Average outgoing degree.
121    pub avg_out_degree: f64,
122    /// Average incoming degree.
123    pub avg_in_degree: f64,
124    /// Per-property statistics for nodes with this label.
125    pub properties: HashMap<PropertyKey, ColumnStatistics>,
126}
127
128impl LabelStatistics {
129    /// Creates new label statistics.
130    pub fn new(node_count: u64) -> Self {
131        Self {
132            node_count,
133            avg_out_degree: 0.0,
134            avg_in_degree: 0.0,
135            properties: HashMap::new(),
136        }
137    }
138
139    /// Sets the average degrees.
140    pub fn with_degrees(mut self, out_degree: f64, in_degree: f64) -> Self {
141        self.avg_out_degree = out_degree;
142        self.avg_in_degree = in_degree;
143        self
144    }
145
146    /// Adds property statistics.
147    pub fn with_property(mut self, property: &str, stats: ColumnStatistics) -> Self {
148        self.properties.insert(property.to_string(), stats);
149        self
150    }
151}
152
153/// Alias for table statistics (used in relational contexts).
154pub type TableStatistics = LabelStatistics;
155
156/// Statistics for an edge type.
157#[derive(Debug, Clone)]
158pub struct EdgeTypeStatistics {
159    /// Number of edges of this type.
160    pub edge_count: u64,
161    /// Average outgoing degree (edges per source node).
162    pub avg_out_degree: f64,
163    /// Average incoming degree (edges per target node).
164    pub avg_in_degree: f64,
165    /// Per-property statistics for edges of this type.
166    pub properties: HashMap<PropertyKey, ColumnStatistics>,
167}
168
169impl EdgeTypeStatistics {
170    /// Creates new edge type statistics.
171    pub fn new(edge_count: u64, avg_out_degree: f64, avg_in_degree: f64) -> Self {
172        Self {
173            edge_count,
174            avg_out_degree,
175            avg_in_degree,
176            properties: HashMap::new(),
177        }
178    }
179
180    /// Adds property statistics.
181    pub fn with_property(mut self, property: &str, stats: ColumnStatistics) -> Self {
182        self.properties.insert(property.to_string(), stats);
183        self
184    }
185}
186
187/// Statistics for a column/property.
188#[derive(Debug, Clone)]
189pub struct ColumnStatistics {
190    /// Number of distinct values.
191    pub distinct_count: u64,
192    /// Total number of values (including nulls).
193    pub total_count: u64,
194    /// Number of null values.
195    pub null_count: u64,
196    /// Minimum value (if applicable).
197    pub min_value: Option<Value>,
198    /// Maximum value (if applicable).
199    pub max_value: Option<Value>,
200    /// Average value (for numeric types).
201    pub avg_value: Option<f64>,
202    /// Equi-depth histogram (for selectivity estimation).
203    pub histogram: Option<Histogram>,
204    /// Most common values with their frequencies.
205    pub most_common: Vec<(Value, f64)>,
206}
207
208impl ColumnStatistics {
209    /// Creates new column statistics with basic info.
210    pub fn new(distinct_count: u64, total_count: u64, null_count: u64) -> Self {
211        Self {
212            distinct_count,
213            total_count,
214            null_count,
215            min_value: None,
216            max_value: None,
217            avg_value: None,
218            histogram: None,
219            most_common: Vec::new(),
220        }
221    }
222
223    /// Sets min/max values.
224    pub fn with_min_max(mut self, min: Value, max: Value) -> Self {
225        self.min_value = Some(min);
226        self.max_value = Some(max);
227        self
228    }
229
230    /// Sets the average value.
231    pub fn with_avg(mut self, avg: f64) -> Self {
232        self.avg_value = Some(avg);
233        self
234    }
235
236    /// Sets the histogram.
237    pub fn with_histogram(mut self, histogram: Histogram) -> Self {
238        self.histogram = Some(histogram);
239        self
240    }
241
242    /// Sets the most common values.
243    pub fn with_most_common(mut self, values: Vec<(Value, f64)>) -> Self {
244        self.most_common = values;
245        self
246    }
247
248    /// Returns the null fraction.
249    pub fn null_fraction(&self) -> f64 {
250        if self.total_count == 0 {
251            0.0
252        } else {
253            self.null_count as f64 / self.total_count as f64
254        }
255    }
256
257    /// Estimates selectivity for an equality predicate.
258    pub fn estimate_equality_selectivity(&self, value: &Value) -> f64 {
259        // Check most common values first
260        for (mcv, freq) in &self.most_common {
261            if mcv == value {
262                return *freq;
263            }
264        }
265
266        // Use histogram if available
267        if let Some(ref hist) = self.histogram {
268            return hist.estimate_equality_selectivity(value);
269        }
270
271        // Fall back to uniform distribution assumption
272        if self.distinct_count > 0 {
273            1.0 / self.distinct_count as f64
274        } else {
275            0.0
276        }
277    }
278
279    /// Estimates selectivity for a range predicate.
280    pub fn estimate_range_selectivity(&self, lower: Option<&Value>, upper: Option<&Value>) -> f64 {
281        if let Some(ref hist) = self.histogram {
282            return hist.estimate_range_selectivity(lower, upper, true, true);
283        }
284
285        // Without histogram, use min/max if available
286        match (&self.min_value, &self.max_value, lower, upper) {
287            (Some(min), Some(max), Some(l), Some(u)) => {
288                // Linear interpolation
289                estimate_linear_range(min, max, l, u)
290            }
291            (Some(_), Some(_), Some(_), None) => 0.5, // Greater than
292            (Some(_), Some(_), None, Some(_)) => 0.5, // Less than
293            _ => 0.33,                                // Default
294        }
295    }
296}
297
298/// Estimates range selectivity using linear interpolation.
299fn estimate_linear_range(min: &Value, max: &Value, lower: &Value, upper: &Value) -> f64 {
300    match (min, max, lower, upper) {
301        (
302            Value::Int64(min_v),
303            Value::Int64(max_v),
304            Value::Int64(lower_v),
305            Value::Int64(upper_v),
306        ) => {
307            let total_range = (max_v - min_v) as f64;
308            if total_range <= 0.0 {
309                return 1.0;
310            }
311
312            let effective_lower = (*lower_v).max(*min_v);
313            let effective_upper = (*upper_v).min(*max_v);
314
315            if effective_upper < effective_lower {
316                return 0.0;
317            }
318
319            (effective_upper - effective_lower) as f64 / total_range
320        }
321        (
322            Value::Float64(min_v),
323            Value::Float64(max_v),
324            Value::Float64(lower_v),
325            Value::Float64(upper_v),
326        ) => {
327            let total_range = max_v - min_v;
328            if total_range <= 0.0 {
329                return 1.0;
330            }
331
332            let effective_lower = lower_v.max(*min_v);
333            let effective_upper = upper_v.min(*max_v);
334
335            if effective_upper < effective_lower {
336                return 0.0;
337            }
338
339            (effective_upper - effective_lower) / total_range
340        }
341        _ => 0.33,
342    }
343}
344
345/// Builder for collecting statistics from data.
346#[allow(dead_code)]
347pub struct StatisticsCollector {
348    /// Values collected for histogram building.
349    values: Vec<Value>,
350    /// Distinct value tracker.
351    distinct: std::collections::HashSet<String>,
352    /// Running min.
353    min: Option<Value>,
354    /// Running max.
355    max: Option<Value>,
356    /// Running sum (for numeric).
357    sum: f64,
358    /// Null count.
359    null_count: u64,
360    /// Value frequency counter.
361    frequencies: HashMap<String, u64>,
362}
363
364#[allow(dead_code)]
365impl StatisticsCollector {
366    /// Creates a new statistics collector.
367    pub fn new() -> Self {
368        Self {
369            values: Vec::new(),
370            distinct: std::collections::HashSet::new(),
371            min: None,
372            max: None,
373            sum: 0.0,
374            null_count: 0,
375            frequencies: HashMap::new(),
376        }
377    }
378
379    /// Adds a value to the collector.
380    pub fn add(&mut self, value: Value) {
381        if matches!(value, Value::Null) {
382            self.null_count += 1;
383            return;
384        }
385
386        // Track distinct values
387        let key = format!("{value:?}");
388        self.distinct.insert(key.clone());
389
390        // Track frequencies
391        *self.frequencies.entry(key).or_insert(0) += 1;
392
393        // Track min/max
394        self.update_min_max(&value);
395
396        // Track sum for numeric
397        if let Some(v) = value_to_f64(&value) {
398            self.sum += v;
399        }
400
401        self.values.push(value);
402    }
403
404    fn update_min_max(&mut self, value: &Value) {
405        // Update min
406        match &self.min {
407            None => self.min = Some(value.clone()),
408            Some(current) => {
409                if compare_values(value, current) == Some(std::cmp::Ordering::Less) {
410                    self.min = Some(value.clone());
411                }
412            }
413        }
414
415        // Update max
416        match &self.max {
417            None => self.max = Some(value.clone()),
418            Some(current) => {
419                if compare_values(value, current) == Some(std::cmp::Ordering::Greater) {
420                    self.max = Some(value.clone());
421                }
422            }
423        }
424    }
425
426    /// Builds column statistics from collected data.
427    pub fn build(mut self, num_histogram_buckets: usize, num_mcv: usize) -> ColumnStatistics {
428        let total_count = self.values.len() as u64 + self.null_count;
429        let distinct_count = self.distinct.len() as u64;
430
431        let avg = if !self.values.is_empty() {
432            Some(self.sum / self.values.len() as f64)
433        } else {
434            None
435        };
436
437        // Build histogram
438        self.values
439            .sort_by(|a, b| compare_values(a, b).unwrap_or(std::cmp::Ordering::Equal));
440        let histogram = if self.values.len() >= num_histogram_buckets {
441            Some(Histogram::build(&self.values, num_histogram_buckets))
442        } else {
443            None
444        };
445
446        // Find most common values
447        let total_non_null = self.values.len() as f64;
448        let mut freq_vec: Vec<_> = self.frequencies.into_iter().collect();
449        freq_vec.sort_by(|a, b| b.1.cmp(&a.1));
450
451        let most_common: Vec<(Value, f64)> = freq_vec
452            .into_iter()
453            .take(num_mcv)
454            .filter_map(|(key, count)| {
455                // Try to parse the key back to a value (simplified)
456                let freq = count as f64 / total_non_null;
457                // This is a simplification - we'd need to store actual values
458                if key.starts_with("Int64(") {
459                    let num_str = key.trim_start_matches("Int64(").trim_end_matches(')');
460                    num_str.parse::<i64>().ok().map(|n| (Value::Int64(n), freq))
461                } else if key.starts_with("String(") {
462                    let s = key
463                        .trim_start_matches("String(Arc(\"")
464                        .trim_end_matches("\"))");
465                    Some((Value::String(s.to_string().into()), freq))
466                } else {
467                    None
468                }
469            })
470            .collect();
471
472        let mut stats = ColumnStatistics::new(distinct_count, total_count, self.null_count);
473
474        if let Some(min) = self.min {
475            if let Some(max) = self.max {
476                stats = stats.with_min_max(min, max);
477            }
478        }
479
480        if let Some(avg) = avg {
481            stats = stats.with_avg(avg);
482        }
483
484        if let Some(hist) = histogram {
485            stats = stats.with_histogram(hist);
486        }
487
488        if !most_common.is_empty() {
489            stats = stats.with_most_common(most_common);
490        }
491
492        stats
493    }
494}
495
496impl Default for StatisticsCollector {
497    fn default() -> Self {
498        Self::new()
499    }
500}
501
502/// Converts a value to f64.
503#[allow(dead_code)]
504fn value_to_f64(value: &Value) -> Option<f64> {
505    match value {
506        Value::Int64(i) => Some(*i as f64),
507        Value::Float64(f) => Some(*f),
508        _ => None,
509    }
510}
511
512/// Compares two values.
513#[allow(dead_code)]
514fn compare_values(a: &Value, b: &Value) -> Option<std::cmp::Ordering> {
515    match (a, b) {
516        (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
517        (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
518        (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
519        (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
520        (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
521        (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
522        _ => None,
523    }
524}
525
526#[cfg(test)]
527mod tests {
528    use super::*;
529
530    #[test]
531    fn test_statistics_collector() {
532        let mut collector = StatisticsCollector::new();
533
534        for i in 0..100 {
535            collector.add(Value::Int64(i % 10)); // Values 0-9, each appearing 10 times
536        }
537
538        let stats = collector.build(10, 5);
539
540        assert_eq!(stats.distinct_count, 10);
541        assert_eq!(stats.total_count, 100);
542        assert_eq!(stats.null_count, 0);
543        assert_eq!(stats.min_value, Some(Value::Int64(0)));
544        assert_eq!(stats.max_value, Some(Value::Int64(9)));
545    }
546
547    #[test]
548    fn test_statistics_with_nulls() {
549        let mut collector = StatisticsCollector::new();
550
551        collector.add(Value::Int64(1));
552        collector.add(Value::Null);
553        collector.add(Value::Int64(2));
554        collector.add(Value::Null);
555        collector.add(Value::Int64(3));
556
557        let stats = collector.build(5, 3);
558
559        assert_eq!(stats.total_count, 5);
560        assert_eq!(stats.null_count, 2);
561        assert_eq!(stats.distinct_count, 3);
562        assert!((stats.null_fraction() - 0.4).abs() < 0.01);
563    }
564
565    #[test]
566    fn test_label_statistics() {
567        let stats = LabelStatistics::new(1000)
568            .with_degrees(5.0, 3.0)
569            .with_property(
570                "age",
571                ColumnStatistics::new(50, 1000, 10)
572                    .with_min_max(Value::Int64(0), Value::Int64(100)),
573            );
574
575        assert_eq!(stats.node_count, 1000);
576        assert_eq!(stats.avg_out_degree, 5.0);
577        assert!(stats.properties.contains_key("age"));
578    }
579
580    #[test]
581    fn test_database_statistics() {
582        let mut db_stats = Statistics::new();
583
584        db_stats.update_label(
585            "Person",
586            LabelStatistics::new(10000).with_degrees(10.0, 10.0),
587        );
588
589        db_stats.update_edge_type("KNOWS", EdgeTypeStatistics::new(50000, 5.0, 5.0));
590
591        assert_eq!(db_stats.estimate_label_cardinality("Person"), 10000.0);
592        assert_eq!(db_stats.estimate_label_cardinality("Unknown"), 1000.0); // Default
593
594        assert_eq!(db_stats.estimate_avg_degree("KNOWS", true), 5.0);
595    }
596}