Skip to main content

reddb_server/storage/index/
stats.rs

1//! Index statistics and kind enumeration surfaced by [`super::IndexBase`].
2//!
3//! Separated from the traits so the planner and diagnostics layers can depend
4//! on the types without pulling in the generic trait definitions.
5
6/// Enumeration of all index families RedDB understands. New structures add a
7/// variant here so the planner can match on them and pick cost models.
8#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
9pub enum IndexKind {
10    /// Placeholder used by `IndexStats::default()`.
11    #[default]
12    Unknown,
13    /// Ordered B-tree / B+ tree (range + point).
14    BTree,
15    /// Hash index (point-only, O(1)).
16    Hash,
17    /// Bitmap index (low-cardinality, set operations).
18    Bitmap,
19    /// Roaring bitmap compressed variant.
20    RoaringBitmap,
21    /// Inverted index for full-text / tokenised lookups.
22    Inverted,
23    /// HNSW approximate nearest-neighbour.
24    Hnsw,
25    /// IVF-Flat vector index.
26    IvfFlat,
27    /// Product-quantised vector index.
28    ProductQuantization,
29    /// Spatial index (R-tree / geo-hash).
30    Spatial,
31    /// Adjacency index for graph edges.
32    GraphAdjacency,
33    /// Temporal BTree for timeseries.
34    Temporal,
35    /// Cuckoo filter.
36    Cuckoo,
37    /// Zone map / min-max summary.
38    ZoneMap,
39    /// Cross-structure unified reference index.
40    UnifiedRef,
41    /// Count-min sketch heavy-hitters (top-k frequency estimate).
42    HeavyHitters,
43}
44
45impl IndexKind {
46    /// Does this kind support range queries out of the box?
47    pub fn supports_range(self) -> bool {
48        matches!(
49            self,
50            IndexKind::BTree | IndexKind::Spatial | IndexKind::Temporal | IndexKind::ZoneMap
51        )
52    }
53
54    /// Does this kind support approximate / similarity queries?
55    pub fn supports_ann(self) -> bool {
56        matches!(
57            self,
58            IndexKind::Hnsw | IndexKind::IvfFlat | IndexKind::ProductQuantization
59        )
60    }
61}
62
63/// Per-index statistics used by the cost-based planner and diagnostics.
64/// All fields are best-effort; zero means "unknown".
65#[derive(Debug, Clone, Default)]
66pub struct IndexStats {
67    /// Total number of `(key, value)` pairs stored.
68    pub entries: usize,
69    /// Number of distinct keys. For hash/btree this equals the key set size.
70    pub distinct_keys: usize,
71    /// Approximate memory footprint in bytes (0 if not tracked).
72    pub approx_bytes: usize,
73    /// Family of index this originates from.
74    pub kind: IndexKind,
75    /// Whether a bloom filter is attached (enables fast negative lookups).
76    pub has_bloom: bool,
77    /// Physical correlation between index key order and heap row order.
78    /// 1.0 = perfectly correlated (monotonic insert, timeseries) → sequential I/O.
79    /// 0.0 = completely random → worst-case random I/O per row.
80    /// Default 0.0 (conservative). Used by Mackert-Lohman I/O cost formula.
81    pub index_correlation: f64,
82}
83
84impl IndexStats {
85    /// Rough selectivity estimate for an equality predicate. Returns the
86    /// expected fraction of rows matching a random key, clamped to `[0, 1]`.
87    ///
88    /// Used by the planner to pick between index probes and full scans.
89    pub fn point_selectivity(&self) -> f64 {
90        if self.distinct_keys == 0 {
91            return 1.0;
92        }
93        (1.0 / self.distinct_keys as f64).clamp(0.0, 1.0)
94    }
95
96    /// Average number of values per distinct key.
97    pub fn avg_values_per_key(&self) -> f64 {
98        if self.distinct_keys == 0 {
99            return 0.0;
100        }
101        self.entries as f64 / self.distinct_keys as f64
102    }
103
104    /// Estimate the I/O cost (in arbitrary page-cost units) of fetching
105    /// `result_rows` rows via this index from a `heap_pages`-page table.
106    ///
107    /// Uses the Mackert-Lohman (1986) formula — the same model PostgreSQL
108    /// uses in `cost_index` (`optimizer/path/costsize.c:545-700`):
109    ///
110    /// ```text
111    /// pages_fetched = ML(selectivity, heap_pages)
112    /// io_cost = lerp(random_io, seq_io, correlation²)
113    /// ```
114    ///
115    /// Constants match PG GUC defaults:
116    /// - `random_page_cost = 4.0`
117    /// - `seq_page_cost    = 1.0`
118    pub fn correlated_io_cost(&self, result_rows: f64, heap_pages: f64) -> f64 {
119        const SEQ_PAGE_COST: f64 = 1.0;
120        const RANDOM_PAGE_COST: f64 = 4.0;
121
122        if heap_pages <= 0.0 || result_rows <= 0.0 {
123            return 0.0;
124        }
125
126        // Mackert-Lohman: expected distinct pages fetched when picking
127        // `result_rows` rows at random from a `heap_pages`-page file.
128        // Approximation: min(result_rows, heap_pages) * (1 - e^(-result_rows/heap_pages))
129        // This is the standard finite-population coupon-collector formula.
130        let frac = result_rows / heap_pages;
131        let pages_fetched = heap_pages * (1.0 - (-frac).exp());
132        let pages_fetched = pages_fetched.min(heap_pages);
133
134        // Random I/O: every page fetch is a seek
135        let random_cost = RANDOM_PAGE_COST * pages_fetched;
136
137        // Sequential I/O: rows arrive in heap order (correlation ≈ 1)
138        let seq_cost = SEQ_PAGE_COST * pages_fetched;
139
140        // Blend: correlation² weights sequential vs random
141        let corr2 = self.index_correlation.powi(2).clamp(0.0, 1.0);
142        seq_cost * corr2 + random_cost * (1.0 - corr2)
143    }
144}
145
146#[cfg(test)]
147mod tests {
148    use super::*;
149
150    #[test]
151    fn selectivity_scales_with_cardinality() {
152        let s = IndexStats {
153            entries: 100,
154            distinct_keys: 100,
155            approx_bytes: 0,
156            kind: IndexKind::Hash,
157            has_bloom: false,
158            index_correlation: 0.0,
159        };
160        assert_eq!(s.point_selectivity(), 0.01);
161        assert_eq!(s.avg_values_per_key(), 1.0);
162    }
163
164    #[test]
165    fn range_capability_flag() {
166        assert!(IndexKind::BTree.supports_range());
167        assert!(!IndexKind::Hash.supports_range());
168        assert!(IndexKind::Hnsw.supports_ann());
169    }
170
171    #[test]
172    fn empty_stats_do_not_divide_by_zero() {
173        let s = IndexStats::default();
174        assert_eq!(s.point_selectivity(), 1.0);
175        assert_eq!(s.avg_values_per_key(), 0.0);
176    }
177
178    #[test]
179    fn correlated_io_cheaper_than_random() {
180        let base = IndexStats {
181            entries: 10_000,
182            distinct_keys: 10_000,
183            approx_bytes: 0,
184            kind: IndexKind::BTree,
185            has_bloom: false,
186            index_correlation: 0.0,
187        };
188        let correlated = IndexStats {
189            index_correlation: 1.0,
190            ..base.clone()
191        };
192        let heap_pages = 1000.0;
193        let result_rows = 100.0;
194
195        let random_cost = base.correlated_io_cost(result_rows, heap_pages);
196        let seq_cost = correlated.correlated_io_cost(result_rows, heap_pages);
197        assert!(
198            seq_cost < random_cost,
199            "correlated (seq) should be cheaper than uncorrelated (random): {seq_cost} vs {random_cost}"
200        );
201    }
202
203    #[test]
204    fn timeseries_gets_full_correlation() {
205        // Timeseries index is set to correlation = 1.0 in temporal_index.rs
206        let s = IndexStats {
207            index_correlation: 1.0,
208            kind: IndexKind::Temporal,
209            ..IndexStats::default()
210        };
211        // With correlation = 1.0, cost = seq_cost × pages_fetched
212        // 0 result_rows → 0 cost
213        assert_eq!(s.correlated_io_cost(0.0, 1000.0), 0.0);
214    }
215}