reddb_server/storage/index/stats.rs
1//! Index statistics and kind enumeration surfaced by [`super::IndexBase`].
2//!
3//! Separated from the traits so the planner and diagnostics layers can depend
4//! on the types without pulling in the generic trait definitions.
5
6/// Enumeration of all index families RedDB understands. New structures add a
7/// variant here so the planner can match on them and pick cost models.
8#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
9pub enum IndexKind {
10 /// Placeholder used by `IndexStats::default()`.
11 #[default]
12 Unknown,
13 /// Ordered B-tree / B+ tree (range + point).
14 BTree,
15 /// Hash index (point-only, O(1)).
16 Hash,
17 /// Bitmap index (low-cardinality, set operations).
18 Bitmap,
19 /// Roaring bitmap compressed variant.
20 RoaringBitmap,
21 /// Inverted index for full-text / tokenised lookups.
22 Inverted,
23 /// HNSW approximate nearest-neighbour.
24 Hnsw,
25 /// IVF-Flat vector index.
26 IvfFlat,
27 /// Product-quantised vector index.
28 ProductQuantization,
29 /// Spatial index (R-tree / geo-hash).
30 Spatial,
31 /// Adjacency index for graph edges.
32 GraphAdjacency,
33 /// Temporal BTree for timeseries.
34 Temporal,
35 /// Cuckoo filter.
36 Cuckoo,
37 /// Zone map / min-max summary.
38 ZoneMap,
39 /// Cross-structure unified reference index.
40 UnifiedRef,
41 /// Count-min sketch heavy-hitters (top-k frequency estimate).
42 HeavyHitters,
43}
44
45impl IndexKind {
46 /// Does this kind support range queries out of the box?
47 pub fn supports_range(self) -> bool {
48 matches!(
49 self,
50 IndexKind::BTree | IndexKind::Spatial | IndexKind::Temporal | IndexKind::ZoneMap
51 )
52 }
53
54 /// Does this kind support approximate / similarity queries?
55 pub fn supports_ann(self) -> bool {
56 matches!(
57 self,
58 IndexKind::Hnsw | IndexKind::IvfFlat | IndexKind::ProductQuantization
59 )
60 }
61}
62
63/// Per-index statistics used by the cost-based planner and diagnostics.
64/// All fields are best-effort; zero means "unknown".
65#[derive(Debug, Clone, Default)]
66pub struct IndexStats {
67 /// Total number of `(key, value)` pairs stored.
68 pub entries: usize,
69 /// Number of distinct keys. For hash/btree this equals the key set size.
70 pub distinct_keys: usize,
71 /// Approximate memory footprint in bytes (0 if not tracked).
72 pub approx_bytes: usize,
73 /// Family of index this originates from.
74 pub kind: IndexKind,
75 /// Whether a bloom filter is attached (enables fast negative lookups).
76 pub has_bloom: bool,
77 /// Physical correlation between index key order and heap row order.
78 /// 1.0 = perfectly correlated (monotonic insert, timeseries) → sequential I/O.
79 /// 0.0 = completely random → worst-case random I/O per row.
80 /// Default 0.0 (conservative). Used by Mackert-Lohman I/O cost formula.
81 pub index_correlation: f64,
82}
83
84impl IndexStats {
85 /// Rough selectivity estimate for an equality predicate. Returns the
86 /// expected fraction of rows matching a random key, clamped to `[0, 1]`.
87 ///
88 /// Used by the planner to pick between index probes and full scans.
89 pub fn point_selectivity(&self) -> f64 {
90 if self.distinct_keys == 0 {
91 return 1.0;
92 }
93 (1.0 / self.distinct_keys as f64).clamp(0.0, 1.0)
94 }
95
96 /// Average number of values per distinct key.
97 pub fn avg_values_per_key(&self) -> f64 {
98 if self.distinct_keys == 0 {
99 return 0.0;
100 }
101 self.entries as f64 / self.distinct_keys as f64
102 }
103
104 /// Estimate the I/O cost (in arbitrary page-cost units) of fetching
105 /// `result_rows` rows via this index from a `heap_pages`-page table.
106 ///
107 /// Uses the Mackert-Lohman (1986) formula — the same model PostgreSQL
108 /// uses in `cost_index` (`optimizer/path/costsize.c:545-700`):
109 ///
110 /// ```text
111 /// pages_fetched = ML(selectivity, heap_pages)
112 /// io_cost = lerp(random_io, seq_io, correlation²)
113 /// ```
114 ///
115 /// Constants match PG GUC defaults:
116 /// - `random_page_cost = 4.0`
117 /// - `seq_page_cost = 1.0`
118 pub fn correlated_io_cost(&self, result_rows: f64, heap_pages: f64) -> f64 {
119 const SEQ_PAGE_COST: f64 = 1.0;
120 const RANDOM_PAGE_COST: f64 = 4.0;
121
122 if heap_pages <= 0.0 || result_rows <= 0.0 {
123 return 0.0;
124 }
125
126 // Mackert-Lohman: expected distinct pages fetched when picking
127 // `result_rows` rows at random from a `heap_pages`-page file.
128 // Approximation: min(result_rows, heap_pages) * (1 - e^(-result_rows/heap_pages))
129 // This is the standard finite-population coupon-collector formula.
130 let frac = result_rows / heap_pages;
131 let pages_fetched = heap_pages * (1.0 - (-frac).exp());
132 let pages_fetched = pages_fetched.min(heap_pages);
133
134 // Random I/O: every page fetch is a seek
135 let random_cost = RANDOM_PAGE_COST * pages_fetched;
136
137 // Sequential I/O: rows arrive in heap order (correlation ≈ 1)
138 let seq_cost = SEQ_PAGE_COST * pages_fetched;
139
140 // Blend: correlation² weights sequential vs random
141 let corr2 = self.index_correlation.powi(2).clamp(0.0, 1.0);
142 seq_cost * corr2 + random_cost * (1.0 - corr2)
143 }
144}
145
146#[cfg(test)]
147mod tests {
148 use super::*;
149
150 #[test]
151 fn selectivity_scales_with_cardinality() {
152 let s = IndexStats {
153 entries: 100,
154 distinct_keys: 100,
155 approx_bytes: 0,
156 kind: IndexKind::Hash,
157 has_bloom: false,
158 index_correlation: 0.0,
159 };
160 assert_eq!(s.point_selectivity(), 0.01);
161 assert_eq!(s.avg_values_per_key(), 1.0);
162 }
163
164 #[test]
165 fn range_capability_flag() {
166 assert!(IndexKind::BTree.supports_range());
167 assert!(!IndexKind::Hash.supports_range());
168 assert!(IndexKind::Hnsw.supports_ann());
169 }
170
171 #[test]
172 fn empty_stats_do_not_divide_by_zero() {
173 let s = IndexStats::default();
174 assert_eq!(s.point_selectivity(), 1.0);
175 assert_eq!(s.avg_values_per_key(), 0.0);
176 }
177
178 #[test]
179 fn correlated_io_cheaper_than_random() {
180 let base = IndexStats {
181 entries: 10_000,
182 distinct_keys: 10_000,
183 approx_bytes: 0,
184 kind: IndexKind::BTree,
185 has_bloom: false,
186 index_correlation: 0.0,
187 };
188 let correlated = IndexStats {
189 index_correlation: 1.0,
190 ..base.clone()
191 };
192 let heap_pages = 1000.0;
193 let result_rows = 100.0;
194
195 let random_cost = base.correlated_io_cost(result_rows, heap_pages);
196 let seq_cost = correlated.correlated_io_cost(result_rows, heap_pages);
197 assert!(
198 seq_cost < random_cost,
199 "correlated (seq) should be cheaper than uncorrelated (random): {seq_cost} vs {random_cost}"
200 );
201 }
202
203 #[test]
204 fn timeseries_gets_full_correlation() {
205 // Timeseries index is set to correlation = 1.0 in temporal_index.rs
206 let s = IndexStats {
207 index_correlation: 1.0,
208 kind: IndexKind::Temporal,
209 ..IndexStats::default()
210 };
211 // With correlation = 1.0, cost = seq_cost × pages_fetched
212 // 0 result_rows → 0 cost
213 assert_eq!(s.correlated_io_cost(0.0, 1000.0), 0.0);
214 }
215}