Skip to main content

xlog_stats/
stats.rs

1//! Core statistics types for GPU-resident relation metadata.
2//!
3//! This module provides statistics tracking for relations and columns that are
4//! used by the query optimizer and solver heuristics to make informed decisions
5//! about query execution strategies.
6
7use xlog_core::{RelId, ScalarType};
8
9/// GPU-resident relation statistics.
10///
11/// Tracks cardinality, memory usage, access patterns, and column-level statistics
12/// for relations stored on the GPU. These statistics drive optimizer cost models
13/// and solver heuristics for efficient query execution.
14#[derive(Debug, Clone)]
15pub struct RelationStats {
16    /// Unique identifier for the relation
17    pub rel_id: RelId,
18    /// Estimated number of rows in the relation
19    pub cardinality: u64,
20    /// Estimated total size in bytes on GPU
21    pub byte_size: u64,
22    /// Per-column statistics
23    pub column_stats: Vec<ColumnStats>,
24    /// Access heat for LRU-style eviction (exponential moving average)
25    pub heat: f32,
26    /// Unix timestamp of last access
27    pub last_access: u64,
28    /// Whether an index exists for this relation
29    pub has_index: bool,
30}
31
32impl RelationStats {
33    /// Creates new statistics for a relation with default (empty) values.
34    ///
35    /// # Arguments
36    /// * `rel_id` - The unique identifier for the relation
37    ///
38    /// # Returns
39    /// A new `RelationStats` instance with zero cardinality, no columns, and cold heat.
40    pub fn new(rel_id: RelId) -> Self {
41        Self {
42            rel_id,
43            cardinality: 0,
44            byte_size: 0,
45            column_stats: Vec::new(),
46            heat: 0.0,
47            last_access: 0,
48            has_index: false,
49        }
50    }
51
52    /// Updates the cardinality (row count) of the relation.
53    ///
54    /// This should be called after bulk loads, inserts, or when statistics
55    /// are refreshed from the actual GPU-resident data.
56    ///
57    /// # Arguments
58    /// * `rows` - The new cardinality estimate
59    pub fn update_cardinality(&mut self, rows: u64) {
60        self.cardinality = rows;
61    }
62
63    /// Updates the byte size estimate for the relation.
64    ///
65    /// # Arguments
66    /// * `bytes` - The estimated total size in bytes
67    pub fn update_byte_size(&mut self, bytes: u64) {
68        self.byte_size = bytes;
69    }
70
71    /// Records an access to this relation, updating heat and timestamp.
72    ///
73    /// Uses an exponential moving average for heat calculation:
74    /// `heat = heat * 0.9 + 0.1`
75    ///
76    /// This causes frequently accessed relations to maintain high heat
77    /// while infrequently accessed ones cool down over time.
78    pub fn record_access(&mut self) {
79        // Exponential moving average for heat
80        self.heat = self.heat * 0.9 + 0.1;
81        self.last_access = std::time::SystemTime::now()
82            .duration_since(std::time::UNIX_EPOCH)
83            .unwrap_or_default()
84            .as_secs();
85    }
86
87    /// Decays the heat by a multiplicative factor.
88    ///
89    /// This should be called periodically (e.g., during garbage collection
90    /// or memory pressure events) to allow unused relations to cool down.
91    ///
92    /// # Arguments
93    /// * `factor` - Multiplicative decay factor (typically 0.0 to 1.0)
94    pub fn decay_heat(&mut self, factor: f32) {
95        self.heat *= factor;
96    }
97
98    /// Adds column statistics for a new column.
99    ///
100    /// # Arguments
101    /// * `col_stats` - The column statistics to add
102    pub fn add_column(&mut self, col_stats: ColumnStats) {
103        self.column_stats.push(col_stats);
104    }
105
106    /// Gets column statistics by index.
107    ///
108    /// # Arguments
109    /// * `col_idx` - The column index
110    ///
111    /// # Returns
112    /// A reference to the column statistics if found
113    pub fn get_column(&self, col_idx: usize) -> Option<&ColumnStats> {
114        self.column_stats.iter().find(|c| c.col_idx == col_idx)
115    }
116
117    /// Gets mutable column statistics by index.
118    ///
119    /// # Arguments
120    /// * `col_idx` - The column index
121    ///
122    /// # Returns
123    /// A mutable reference to the column statistics if found
124    pub fn get_column_mut(&mut self, col_idx: usize) -> Option<&mut ColumnStats> {
125        self.column_stats.iter_mut().find(|c| c.col_idx == col_idx)
126    }
127
128    /// Estimates the selectivity for a given predicate cardinality.
129    ///
130    /// # Arguments
131    /// * `estimated_matches` - The estimated number of matching rows
132    ///
133    /// # Returns
134    /// The selectivity as a ratio (0.0 to 1.0)
135    pub fn estimate_selectivity(&self, estimated_matches: u64) -> f64 {
136        if self.cardinality == 0 {
137            return 1.0;
138        }
139        (estimated_matches as f64 / self.cardinality as f64).clamp(0.0, 1.0)
140    }
141}
142
143/// Per-column statistics for optimizer cost estimation.
144///
145/// Tracks null counts, distinct value estimates, and value ranges for columns.
146/// These statistics enable the optimizer to estimate filter selectivity and
147/// join cardinalities.
148#[derive(Debug, Clone)]
149pub struct ColumnStats {
150    /// Column index within the relation
151    pub col_idx: usize,
152    /// Data type of the column
153    pub dtype: ScalarType,
154    /// Count of null values (for nullable columns)
155    pub null_count: u64,
156    /// HyperLogLog-style distinct value estimate
157    pub distinct_estimate: u64,
158    /// Minimum value (for orderable types, stored as i64)
159    pub min_value: Option<i64>,
160    /// Maximum value (for orderable types, stored as i64)
161    pub max_value: Option<i64>,
162    /// Average value length for variable-length types (e.g., symbols)
163    pub avg_width: Option<f32>,
164}
165
166impl ColumnStats {
167    /// Creates new column statistics with default values.
168    ///
169    /// # Arguments
170    /// * `col_idx` - The column index within the relation
171    /// * `dtype` - The scalar type of the column
172    ///
173    /// # Returns
174    /// A new `ColumnStats` instance with zero counts and no range information.
175    pub fn new(col_idx: usize, dtype: ScalarType) -> Self {
176        Self {
177            col_idx,
178            dtype,
179            null_count: 0,
180            distinct_estimate: 0,
181            min_value: None,
182            max_value: None,
183            avg_width: None,
184        }
185    }
186
187    /// Updates the distinct value estimate.
188    ///
189    /// This should be updated from HyperLogLog or similar cardinality estimation
190    /// algorithms running on the GPU.
191    ///
192    /// # Arguments
193    /// * `estimate` - The new distinct value estimate
194    pub fn update_distinct(&mut self, estimate: u64) {
195        self.distinct_estimate = estimate;
196    }
197
198    /// Updates the value range for this column.
199    ///
200    /// # Arguments
201    /// * `min` - The minimum value (encoded as i64)
202    /// * `max` - The maximum value (encoded as i64)
203    pub fn update_range(&mut self, min: i64, max: i64) {
204        self.min_value = Some(min);
205        self.max_value = Some(max);
206    }
207
208    /// Updates the null count for this column.
209    ///
210    /// # Arguments
211    /// * `count` - The number of null values
212    pub fn update_null_count(&mut self, count: u64) {
213        self.null_count = count;
214    }
215
216    /// Updates the average width for variable-length columns.
217    ///
218    /// # Arguments
219    /// * `width` - The average value width in bytes
220    pub fn update_avg_width(&mut self, width: f32) {
221        self.avg_width = Some(width);
222    }
223
224    /// Estimates selectivity for an equality predicate.
225    ///
226    /// Uses the distinct value count to estimate selectivity. If no distinct
227    /// count is available, returns a default estimate.
228    ///
229    /// # Arguments
230    /// * `total_rows` - The total number of rows in the relation
231    ///
232    /// # Returns
233    /// The estimated selectivity (0.0 to 1.0)
234    pub fn equality_selectivity(&self, total_rows: u64) -> f64 {
235        if self.distinct_estimate == 0 || total_rows == 0 {
236            // Default selectivity when no statistics available
237            return 0.1;
238        }
239        1.0 / self.distinct_estimate as f64
240    }
241
242    /// Estimates selectivity for a range predicate.
243    ///
244    /// Uses min/max values to estimate what fraction of the range is covered.
245    /// Returns a default estimate if range statistics are unavailable.
246    ///
247    /// # Arguments
248    /// * `low` - The lower bound of the range (inclusive)
249    /// * `high` - The upper bound of the range (inclusive)
250    ///
251    /// # Returns
252    /// The estimated selectivity (0.0 to 1.0)
253    pub fn range_selectivity(&self, low: i64, high: i64) -> f64 {
254        match (self.min_value, self.max_value) {
255            (Some(col_min), Some(col_max)) if col_max > col_min => {
256                let col_range = (col_max - col_min) as f64;
257                let effective_low = low.max(col_min);
258                let effective_high = high.min(col_max);
259                if effective_high < effective_low {
260                    return 0.0;
261                }
262                let query_range = (effective_high - effective_low) as f64;
263                (query_range / col_range).clamp(0.0, 1.0)
264            }
265            _ => {
266                // Default range selectivity when no statistics available
267                0.25
268            }
269        }
270    }
271
272    /// Returns the storage size per value for this column type.
273    pub fn value_size_bytes(&self) -> usize {
274        self.dtype.size_bytes()
275    }
276}
277
278/// Join selectivity model for estimating join output cardinality.
279///
280/// Tracks information about joins between two relations, including the join
281/// keys and estimated selectivity. This is crucial for the optimizer to
282/// choose between nested-loop, hash, and sort-merge join strategies.
283#[derive(Debug, Clone)]
284pub struct JoinSelectivity {
285    /// Left relation in the join
286    pub left_rel: RelId,
287    /// Right relation in the join
288    pub right_rel: RelId,
289    /// Column indices used as join keys on the left relation
290    pub left_keys: Vec<usize>,
291    /// Column indices used as join keys on the right relation
292    pub right_keys: Vec<usize>,
293    /// Estimated selectivity factor (0.0 to 1.0)
294    pub selectivity: f64,
295    /// Whether this is a primary key to foreign key join
296    pub is_pk_fk: bool,
297    /// Cached join cardinality estimate (if computed)
298    cached_output_estimate: Option<u64>,
299}
300
301impl JoinSelectivity {
302    /// Creates a new join selectivity model between two relations.
303    ///
304    /// Initializes with default selectivity of 1.0 (cross product).
305    ///
306    /// # Arguments
307    /// * `left_rel` - The left relation's ID
308    /// * `right_rel` - The right relation's ID
309    ///
310    /// # Returns
311    /// A new `JoinSelectivity` with default values.
312    pub fn new(left_rel: RelId, right_rel: RelId) -> Self {
313        Self {
314            left_rel,
315            right_rel,
316            left_keys: Vec::new(),
317            right_keys: Vec::new(),
318            selectivity: 1.0,
319            is_pk_fk: false,
320            cached_output_estimate: None,
321        }
322    }
323
324    /// Sets the join keys for both relations.
325    ///
326    /// # Arguments
327    /// * `left_keys` - Column indices on the left relation
328    /// * `right_keys` - Column indices on the right relation
329    pub fn set_keys(&mut self, left_keys: Vec<usize>, right_keys: Vec<usize>) {
330        debug_assert_eq!(
331            left_keys.len(),
332            right_keys.len(),
333            "Join key counts must match"
334        );
335        self.left_keys = left_keys;
336        self.right_keys = right_keys;
337        self.cached_output_estimate = None;
338    }
339
340    /// Sets the selectivity factor.
341    ///
342    /// # Arguments
343    /// * `selectivity` - The selectivity factor (0.0 to 1.0)
344    pub fn set_selectivity(&mut self, selectivity: f64) {
345        self.selectivity = selectivity.clamp(0.0, 1.0);
346        self.cached_output_estimate = None;
347    }
348
349    /// Marks this as a primary key to foreign key join.
350    ///
351    /// PK-FK joins have special selectivity characteristics: the output
352    /// cardinality equals the FK side's cardinality.
353    pub fn mark_pk_fk(&mut self) {
354        self.is_pk_fk = true;
355    }
356
357    /// Estimates the output row count for this join.
358    ///
359    /// For PK-FK joins, returns the cardinality of the FK side.
360    /// For other joins, returns: left_rows * right_rows * selectivity
361    ///
362    /// # Arguments
363    /// * `left_rows` - Cardinality of the left relation
364    /// * `right_rows` - Cardinality of the right relation
365    ///
366    /// # Returns
367    /// The estimated output cardinality (minimum of 1)
368    pub fn estimate_output_rows(&self, left_rows: u64, right_rows: u64) -> u64 {
369        if self.is_pk_fk {
370            // FK side determines cardinality in PK-FK joins
371            // Conventionally, right side is FK
372            return right_rows;
373        }
374        ((left_rows as f64 * right_rows as f64 * self.selectivity) as u64).max(1)
375    }
376
377    /// Estimates selectivity from column statistics.
378    ///
379    /// Uses the "independence assumption" and distinct value counts:
380    /// selectivity = 1 / max(distinct_left, distinct_right)
381    ///
382    /// # Arguments
383    /// * `left_distinct` - Distinct value count for left join key
384    /// * `right_distinct` - Distinct value count for right join key
385    ///
386    /// # Returns
387    /// The estimated selectivity
388    pub fn estimate_selectivity_from_stats(left_distinct: u64, right_distinct: u64) -> f64 {
389        if left_distinct == 0 || right_distinct == 0 {
390            return 1.0;
391        }
392        1.0 / left_distinct.max(right_distinct) as f64
393    }
394
395    /// Updates selectivity based on observed join statistics.
396    ///
397    /// This can be called after query execution to improve future estimates.
398    ///
399    /// # Arguments
400    /// * `left_rows` - Actual left cardinality
401    /// * `right_rows` - Actual right cardinality
402    /// * `output_rows` - Actual output cardinality
403    pub fn update_from_observation(&mut self, left_rows: u64, right_rows: u64, output_rows: u64) {
404        let product = left_rows as f64 * right_rows as f64;
405        if product > 0.0 {
406            self.selectivity = (output_rows as f64 / product).clamp(0.0, 1.0);
407            self.cached_output_estimate = Some(output_rows);
408        }
409    }
410}
411
412#[cfg(test)]
413mod tests {
414    use super::*;
415
416    #[test]
417    fn test_relation_stats_new() {
418        let stats = RelationStats::new(RelId(1));
419        assert_eq!(stats.rel_id, RelId(1));
420        assert_eq!(stats.cardinality, 0);
421        assert_eq!(stats.heat, 0.0);
422        assert_eq!(stats.byte_size, 0);
423        assert!(stats.column_stats.is_empty());
424        assert!(!stats.has_index);
425    }
426
427    #[test]
428    fn test_relation_stats_update_cardinality() {
429        let mut stats = RelationStats::new(RelId(1));
430        stats.update_cardinality(1000);
431        assert_eq!(stats.cardinality, 1000);
432    }
433
434    #[test]
435    fn test_relation_stats_update_byte_size() {
436        let mut stats = RelationStats::new(RelId(1));
437        stats.update_byte_size(4096);
438        assert_eq!(stats.byte_size, 4096);
439    }
440
441    #[test]
442    fn test_relation_stats_update_heat() {
443        let mut stats = RelationStats::new(RelId(1));
444        assert_eq!(stats.heat, 0.0);
445
446        stats.record_access();
447        assert!(stats.heat > 0.0);
448        let heat_after_first = stats.heat;
449        assert!((heat_after_first - 0.1).abs() < 0.001);
450
451        stats.record_access();
452        assert!(stats.heat > heat_after_first);
453        // After second access: 0.1 * 0.9 + 0.1 = 0.19
454        assert!((stats.heat - 0.19).abs() < 0.001);
455
456        // Verify last_access was set
457        assert!(stats.last_access > 0);
458    }
459
460    #[test]
461    fn test_relation_stats_decay_heat() {
462        let mut stats = RelationStats::new(RelId(1));
463        stats.record_access();
464        stats.record_access();
465        let initial_heat = stats.heat;
466
467        stats.decay_heat(0.5);
468        assert!((stats.heat - initial_heat * 0.5).abs() < 0.001);
469    }
470
471    #[test]
472    fn test_relation_stats_column_management() {
473        let mut stats = RelationStats::new(RelId(1));
474        let col0 = ColumnStats::new(0, ScalarType::U32);
475        let col1 = ColumnStats::new(1, ScalarType::I64);
476
477        stats.add_column(col0);
478        stats.add_column(col1);
479
480        assert_eq!(stats.column_stats.len(), 2);
481        assert!(stats.get_column(0).is_some());
482        assert!(stats.get_column(1).is_some());
483        assert!(stats.get_column(2).is_none());
484
485        // Test mutable access
486        if let Some(col) = stats.get_column_mut(0) {
487            col.update_distinct(100);
488        }
489        assert_eq!(stats.get_column(0).unwrap().distinct_estimate, 100);
490    }
491
492    #[test]
493    fn test_relation_stats_estimate_selectivity() {
494        let mut stats = RelationStats::new(RelId(1));
495        stats.update_cardinality(1000);
496
497        // 100 matches out of 1000 = 0.1 selectivity
498        let sel = stats.estimate_selectivity(100);
499        assert!((sel - 0.1).abs() < 0.001);
500
501        // Edge case: zero cardinality
502        let empty_stats = RelationStats::new(RelId(2));
503        assert_eq!(empty_stats.estimate_selectivity(50), 1.0);
504    }
505
506    #[test]
507    fn test_column_stats_new() {
508        let col = ColumnStats::new(0, ScalarType::U32);
509        assert_eq!(col.col_idx, 0);
510        assert_eq!(col.dtype, ScalarType::U32);
511        assert_eq!(col.distinct_estimate, 0);
512        assert_eq!(col.null_count, 0);
513        assert!(col.min_value.is_none());
514        assert!(col.max_value.is_none());
515        assert!(col.avg_width.is_none());
516    }
517
518    #[test]
519    fn test_column_stats_update_distinct() {
520        let mut col = ColumnStats::new(0, ScalarType::U32);
521        col.update_distinct(500);
522        assert_eq!(col.distinct_estimate, 500);
523    }
524
525    #[test]
526    fn test_column_stats_update_range() {
527        let mut col = ColumnStats::new(0, ScalarType::I32);
528        col.update_range(-100, 100);
529        assert_eq!(col.min_value, Some(-100));
530        assert_eq!(col.max_value, Some(100));
531    }
532
533    #[test]
534    fn test_column_stats_update_null_count() {
535        let mut col = ColumnStats::new(0, ScalarType::U32);
536        col.update_null_count(42);
537        assert_eq!(col.null_count, 42);
538    }
539
540    #[test]
541    fn test_column_stats_update_avg_width() {
542        let mut col = ColumnStats::new(0, ScalarType::Symbol);
543        col.update_avg_width(12.5);
544        assert_eq!(col.avg_width, Some(12.5));
545    }
546
547    #[test]
548    fn test_column_stats_equality_selectivity() {
549        let mut col = ColumnStats::new(0, ScalarType::U32);
550        col.update_distinct(100);
551
552        let sel = col.equality_selectivity(1000);
553        assert!((sel - 0.01).abs() < 0.0001); // 1/100 = 0.01
554
555        // Edge case: no distinct estimate
556        let empty_col = ColumnStats::new(1, ScalarType::U32);
557        assert_eq!(empty_col.equality_selectivity(1000), 0.1); // default
558    }
559
560    #[test]
561    fn test_column_stats_range_selectivity() {
562        let mut col = ColumnStats::new(0, ScalarType::I64);
563        col.update_range(0, 100);
564
565        // Query for [25, 75] on column with range [0, 100]
566        let sel = col.range_selectivity(25, 75);
567        assert!((sel - 0.5).abs() < 0.001); // (75-25)/100 = 0.5
568
569        // Query outside range
570        let sel_outside = col.range_selectivity(200, 300);
571        assert_eq!(sel_outside, 0.0);
572
573        // Query partially overlapping
574        let sel_partial = col.range_selectivity(50, 150);
575        assert!((sel_partial - 0.5).abs() < 0.001); // (100-50)/100 = 0.5
576
577        // No range stats available
578        let empty_col = ColumnStats::new(1, ScalarType::I64);
579        assert_eq!(empty_col.range_selectivity(0, 100), 0.25); // default
580    }
581
582    #[test]
583    fn test_column_stats_value_size() {
584        assert_eq!(ColumnStats::new(0, ScalarType::U32).value_size_bytes(), 4);
585        assert_eq!(ColumnStats::new(0, ScalarType::U64).value_size_bytes(), 8);
586        assert_eq!(ColumnStats::new(0, ScalarType::Bool).value_size_bytes(), 1);
587    }
588
589    #[test]
590    fn test_join_selectivity_new() {
591        let js = JoinSelectivity::new(RelId(1), RelId(2));
592        assert_eq!(js.left_rel, RelId(1));
593        assert_eq!(js.right_rel, RelId(2));
594        assert!(js.left_keys.is_empty());
595        assert!(js.right_keys.is_empty());
596        assert_eq!(js.selectivity, 1.0);
597        assert!(!js.is_pk_fk);
598    }
599
600    #[test]
601    fn test_join_selectivity_set_keys() {
602        let mut js = JoinSelectivity::new(RelId(1), RelId(2));
603        js.set_keys(vec![0, 1], vec![0, 1]);
604        assert_eq!(js.left_keys, vec![0, 1]);
605        assert_eq!(js.right_keys, vec![0, 1]);
606    }
607
608    #[test]
609    fn test_join_selectivity_set_selectivity() {
610        let mut js = JoinSelectivity::new(RelId(1), RelId(2));
611        js.set_selectivity(0.01);
612        assert!((js.selectivity - 0.01).abs() < 0.0001);
613
614        // Test clamping
615        js.set_selectivity(2.0);
616        assert_eq!(js.selectivity, 1.0);
617
618        js.set_selectivity(-1.0);
619        assert_eq!(js.selectivity, 0.0);
620    }
621
622    #[test]
623    fn test_join_selectivity_estimate_output_rows() {
624        let mut js = JoinSelectivity::new(RelId(1), RelId(2));
625        js.set_selectivity(0.01);
626
627        // 1000 * 500 * 0.01 = 5000
628        let output = js.estimate_output_rows(1000, 500);
629        assert_eq!(output, 5000);
630
631        // Test minimum of 1
632        js.set_selectivity(0.0);
633        let output_min = js.estimate_output_rows(10, 10);
634        assert_eq!(output_min, 1);
635    }
636
637    #[test]
638    fn test_join_selectivity_pk_fk() {
639        let mut js = JoinSelectivity::new(RelId(1), RelId(2));
640        js.mark_pk_fk();
641        assert!(js.is_pk_fk);
642
643        // PK-FK join: output = FK side cardinality
644        let output = js.estimate_output_rows(100, 500);
645        assert_eq!(output, 500); // FK side (right) cardinality
646    }
647
648    #[test]
649    fn test_join_selectivity_estimate_from_stats() {
650        // Selectivity = 1 / max(100, 200) = 0.005
651        let sel = JoinSelectivity::estimate_selectivity_from_stats(100, 200);
652        assert!((sel - 0.005).abs() < 0.0001);
653
654        // Edge case: zero distinct
655        let sel_zero = JoinSelectivity::estimate_selectivity_from_stats(0, 100);
656        assert_eq!(sel_zero, 1.0);
657    }
658
659    #[test]
660    fn test_join_selectivity_update_from_observation() {
661        let mut js = JoinSelectivity::new(RelId(1), RelId(2));
662        js.update_from_observation(1000, 500, 2500);
663
664        // Observed selectivity = 2500 / (1000 * 500) = 0.005
665        assert!((js.selectivity - 0.005).abs() < 0.0001);
666    }
667
668    #[test]
669    fn test_all_scalar_types_column_stats() {
670        // Ensure we can create column stats for all scalar types
671        let types = [
672            ScalarType::U32,
673            ScalarType::U64,
674            ScalarType::I32,
675            ScalarType::I64,
676            ScalarType::F32,
677            ScalarType::F64,
678            ScalarType::Bool,
679            ScalarType::Symbol,
680        ];
681
682        for (idx, dtype) in types.iter().enumerate() {
683            let col = ColumnStats::new(idx, *dtype);
684            assert_eq!(col.col_idx, idx);
685            assert_eq!(col.dtype, *dtype);
686            assert!(col.value_size_bytes() > 0);
687        }
688    }
689}