Skip to main content

spg_engine/
selectivity.rs

1// pedantic doc_markdown flags every bare ident in the comment-as-spec
2// block + several proper nouns; allowing at the module level keeps
3// the spec readable.
4#![allow(clippy::doc_markdown)]
5
6//! v6.2.2 — selectivity estimation over per-column statistics.
7//!
8//! Each selectivity function returns a fraction in `[0.0, 1.0]` —
9//! the planner multiplies these against `row_count` to get
10//! estimated input cardinality for each operator. v6.2.3 JOIN
11//! reorder consumes these estimates; v6.2.4 EXPLAIN ANALYZE
12//! surfaces them alongside the actual-rows count.
13//!
14//! Defaults follow PG's "no-stats" guesses so a freshly-loaded
15//! table without a prior ANALYZE still gets a plausible plan:
16//!
17//!   - `DEFAULT_EQ      = 0.005`  — PG's `DEFAULT_EQ_SEL`
18//!   - `DEFAULT_RANGE   = 0.333`  — PG's `DEFAULT_INEQ_SEL`
19//!   - `DEFAULT_BETWEEN = 0.005`  — narrower than range; matches PG
20//!     for `BETWEEN x AND y` without stats
21//!   - `DEFAULT_LIKE    = 0.005`  — PG's `DEFAULT_MATCH_SEL`
22//!
23//! Histogram walks use a binary-search-based "fraction ≤ value"
24//! primitive (`fraction_le_value`), giving us range estimation in
25//! `O(log n_buckets)` per call. Equality keys off `n_distinct`
26//! when the value lands inside the histogram range; out-of-range
27//! values get an extrapolation cap so OUT-OF-RANGE predicates
28//! don't collapse to zero (which would make the planner pick
29//! degenerate plans like cross-products).
30
31use alloc::string::ToString;
32
33use spg_storage::Value;
34
35use crate::statistics::ColumnStats;
36
37/// PG's default selectivity for `col = constant` when no histogram
38/// is available. v6.2.x can re-tune.
39pub const DEFAULT_EQ: f64 = 0.005;
40
41/// PG's default for `col <= / < / >= / > constant` without stats.
42pub const DEFAULT_RANGE: f64 = 0.333;
43
44/// PG's default for `col BETWEEN a AND b` without stats.
45pub const DEFAULT_BETWEEN: f64 = 0.005;
46
47/// PG's default for `col LIKE 'prefix%'` without stats.
48pub const DEFAULT_LIKE: f64 = 0.005;
49
50/// Floor for any selectivity to avoid degenerate zero estimates
51/// (PG uses 1.0e-7; we widen to 1.0e-6 for v6.2.x and re-tune as
52/// data shows up).
53const MIN_SELECTIVITY: f64 = 1.0e-6;
54
55/// `col = value`. With stats, returns `(1 / n_distinct) × (1 -
56/// null_frac)` when `value` lies in the histogram range, else
57/// scales down by an order of magnitude for out-of-range
58/// extrapolation. Without stats, returns [`DEFAULT_EQ`].
59pub fn equal(stats: Option<&ColumnStats>, value: &Value) -> f64 {
60    let Some(s) = stats else {
61        return DEFAULT_EQ;
62    };
63    if s.histogram_bounds.is_empty() || s.n_distinct == 0 {
64        return DEFAULT_EQ;
65    }
66    let base = (1.0 - f64::from(s.null_frac)) / s.n_distinct as f64;
67    let in_range = value_in_histogram_range(s, value);
68    if in_range {
69        base.max(MIN_SELECTIVITY).min(1.0)
70    } else {
71        // Out-of-range: still positive, but an order of magnitude
72        // lower than the in-range guess.
73        (base * 0.1).max(MIN_SELECTIVITY)
74    }
75}
76
77/// `col >= low AND col <= high` (with both bounds optional). When
78/// `low` is `None` the lower side is open at −∞; same for `high`
79/// and +∞. `lo_incl` / `hi_incl` control whether the boundary
80/// itself is included (currently a near-no-op since selectivity
81/// estimation is approximate at the boundary, but kept in the
82/// signature so the planner can pass the parser's intent through).
83pub fn range(
84    stats: Option<&ColumnStats>,
85    low: Option<&Value>,
86    high: Option<&Value>,
87    _lo_incl: bool,
88    _hi_incl: bool,
89) -> f64 {
90    let Some(s) = stats else {
91        return match (low, high) {
92            (Some(_), Some(_)) => DEFAULT_BETWEEN,
93            _ => DEFAULT_RANGE,
94        };
95    };
96    if s.histogram_bounds.is_empty() {
97        return match (low, high) {
98            (Some(_), Some(_)) => DEFAULT_BETWEEN,
99            _ => DEFAULT_RANGE,
100        };
101    }
102    let lo_frac = match low {
103        None => 0.0,
104        Some(v) => fraction_le_value(s, v),
105    };
106    let hi_frac = match high {
107        None => 1.0,
108        Some(v) => fraction_le_value(s, v),
109    };
110    let raw = (hi_frac - lo_frac).clamp(0.0, 1.0);
111    (raw * (1.0 - f64::from(s.null_frac))).max(MIN_SELECTIVITY)
112}
113
114/// `col BETWEEN low AND high` — convenience for the inclusive
115/// double-bounded shape. Equivalent to [`range`] with both
116/// bounds set and inclusive.
117pub fn between(stats: Option<&ColumnStats>, low: &Value, high: &Value) -> f64 {
118    range(stats, Some(low), Some(high), true, true)
119}
120
121/// `col IN (v1, v2, …)`. Sums per-value equality selectivities,
122/// clamped at 1.0. Without stats, returns `DEFAULT_EQ × len(values)`
123/// (also clamped) — the same shape PG would produce.
124pub fn in_list(stats: Option<&ColumnStats>, values: &[Value]) -> f64 {
125    if values.is_empty() {
126        // Empty IN list — selectivity 0 (matches no rows). Still
127        // floored at MIN_SELECTIVITY so the planner doesn't see
128        // a literal zero.
129        return MIN_SELECTIVITY;
130    }
131    let total: f64 = values.iter().map(|v| equal(stats, v)).sum();
132    total.clamp(MIN_SELECTIVITY, 1.0)
133}
134
135/// `col LIKE 'prefix%'` (or any single-prefix anchored pattern).
136/// With stats, estimates as `range(prefix, prefix + "\u{FFFF}")`
137/// on the assumption the column's natural ordering is a prefix
138/// order (Text lex). Without stats, [`DEFAULT_LIKE`].
139pub fn like_prefix(stats: Option<&ColumnStats>, prefix: &str) -> f64 {
140    let Some(s) = stats else {
141        return DEFAULT_LIKE;
142    };
143    if s.histogram_bounds.is_empty() {
144        return DEFAULT_LIKE;
145    }
146    // Synthesize "prefix\u{10FFFF}" as the upper bound — any
147    // string starting with the prefix sorts ≤ it. Avoids parsing
148    // the prefix as a typed Value since this only applies to TEXT
149    // columns in practice.
150    let low_str = prefix.to_string();
151    let mut high_str = prefix.to_string();
152    high_str.push('\u{10FFFF}');
153    let lo_frac = fraction_le_string(s, &low_str);
154    let hi_frac = fraction_le_string(s, &high_str);
155    let raw = (hi_frac - lo_frac).clamp(0.0, 1.0);
156    (raw * (1.0 - f64::from(s.null_frac))).max(MIN_SELECTIVITY)
157}
158
159// ── histogram walk primitives ───────────────────────────────────
160
161/// Return true when `value` lies between the histogram's first
162/// and last bound (inclusive).
163fn value_in_histogram_range(stats: &ColumnStats, value: &Value) -> bool {
164    let lo = match stats.histogram_bounds.first() {
165        Some(s) => s,
166        None => return false,
167    };
168    let hi = stats
169        .histogram_bounds
170        .last()
171        .expect("first present implies last present");
172    let cmp_lo = value_cmp_str(value, lo);
173    let cmp_hi = value_cmp_str(value, hi);
174    matches!(
175        cmp_lo,
176        core::cmp::Ordering::Equal | core::cmp::Ordering::Greater
177    ) && matches!(
178        cmp_hi,
179        core::cmp::Ordering::Equal | core::cmp::Ordering::Less
180    )
181}
182
183/// `fraction of rows with column-value ≤ value`. Performs binary
184/// search over the 101 histogram bounds; each bucket contains
185/// approximately `1 / num_buckets` of the rows.
186fn fraction_le_value(stats: &ColumnStats, value: &Value) -> f64 {
187    if stats.histogram_bounds.is_empty() {
188        return 0.5;
189    }
190    let n = stats.histogram_bounds.len();
191    let num_buckets = (n - 1).max(1) as f64;
192    let mut lo = 0usize;
193    let mut hi = n;
194    while lo < hi {
195        let mid = (lo + hi) / 2;
196        match value_cmp_str(value, &stats.histogram_bounds[mid]) {
197            core::cmp::Ordering::Less => hi = mid,
198            core::cmp::Ordering::Equal | core::cmp::Ordering::Greater => lo = mid + 1,
199        }
200    }
201    // `lo` = first bound strictly > value. Rows with column ≤
202    // value are the first `lo - 1` (zero-indexed) buckets +
203    // partial overlap with bucket `lo - 1`. For simplicity we
204    // treat it as `(lo - 1) / num_buckets` clamped to [0, 1].
205    let bound_idx = lo.saturating_sub(1);
206    (bound_idx as f64 / num_buckets).clamp(0.0, 1.0)
207}
208
209/// String-keyed version of `fraction_le_value`. Used by
210/// `like_prefix` which works on raw prefix strings instead of a
211/// typed Value.
212fn fraction_le_string(stats: &ColumnStats, key: &str) -> f64 {
213    if stats.histogram_bounds.is_empty() {
214        return 0.5;
215    }
216    let n = stats.histogram_bounds.len();
217    let num_buckets = (n - 1).max(1) as f64;
218    let mut lo = 0usize;
219    let mut hi = n;
220    while lo < hi {
221        let mid = (lo + hi) / 2;
222        if stats.histogram_bounds[mid].as_str() <= key {
223            lo = mid + 1;
224        } else {
225            hi = mid;
226        }
227    }
228    let bound_idx = lo.saturating_sub(1);
229    (bound_idx as f64 / num_buckets).clamp(0.0, 1.0)
230}
231
232/// v6.2.2 — Type-aware compare of a [`Value`] against a histogram
233/// bound (always a canonical-form `String`). Tries to interpret
234/// the bound in the value's expected numeric/date type first;
235/// falls back to the value's own string representation for total
236/// ordering. The fallback isn't strictly necessary for v6.2.2 —
237/// the planner only calls selectivity with values matching the
238/// column type — but keeps the function total.
239fn value_cmp_str(value: &Value, bound: &str) -> core::cmp::Ordering {
240    use core::cmp::Ordering;
241    match value {
242        Value::SmallInt(n) => bound
243            .parse::<i64>()
244            .map_or(Ordering::Equal, |b| i64::from(*n).cmp(&b)),
245        Value::Int(n) => bound
246            .parse::<i64>()
247            .map_or(Ordering::Equal, |b| i64::from(*n).cmp(&b)),
248        Value::BigInt(n) => bound.parse::<i64>().map_or(Ordering::Equal, |b| n.cmp(&b)),
249        Value::Float(x) => bound
250            .parse::<f64>()
251            .ok()
252            .and_then(|b| x.partial_cmp(&b))
253            .unwrap_or(Ordering::Equal),
254        Value::Text(s) | Value::Json(s) => s.as_str().cmp(bound),
255        Value::Bool(b) => {
256            let bs = if *b { "t" } else { "f" };
257            bs.cmp(bound)
258        }
259        // Date / Timestamp / Interval / Numeric / Vector* live in
260        // their canonical SQL form — bound is the same shape so a
261        // direct string compare on the canonical form is safe (ISO
262        // dates / timestamps sort correctly lexicographically).
263        Value::Date(_)
264        | Value::Timestamp(_)
265        | Value::Interval { .. }
266        | Value::Numeric { .. }
267        | Value::Vector(_)
268        | Value::Sq8Vector(_)
269        | Value::HalfVector(_) => {
270            // Best-effort: render the value and lex-compare. The
271            // selectivity numbers for these on a typed column are
272            // approximate anyway.
273            crate::canonical_value_repr(value).as_str().cmp(bound)
274        }
275        Value::Null => {
276            // NULL never participates in selectivity comparisons —
277            // the planner accounts for it via null_frac. Return
278            // Equal as a defensive no-op.
279            Ordering::Equal
280        }
281        // v7.5.0 — Value is #[non_exhaustive]; future variants fall
282        // back to canonical-form lex compare.
283        _ => crate::canonical_value_repr(value).as_str().cmp(bound),
284    }
285}
286
287// ── unit tests ──────────────────────────────────────────────────
288
289#[cfg(test)]
290mod tests {
291    use super::*;
292    use crate::statistics::ColumnStats;
293    use alloc::string::String;
294    use alloc::vec::Vec;
295
296    fn mk_int_stats(min: i64, max: i64, distinct: u64, nulls: f32) -> ColumnStats {
297        // Synthesise a histogram with 101 bounds linearly spaced
298        // between min and max. The exact spacing isn't critical
299        // for the unit cases; `range` cares about ordering.
300        let n = 100usize;
301        let mut bounds: Vec<String> = Vec::with_capacity(n + 1);
302        for i in 0..=n {
303            let v = min + (max - min) * i as i64 / n as i64;
304            bounds.push(alloc::format!("{v}"));
305        }
306        ColumnStats {
307            null_frac: nulls,
308            n_distinct: distinct,
309            histogram_bounds: bounds,
310        }
311    }
312
313    #[test]
314    fn no_stats_returns_pg_defaults() {
315        assert_eq!(equal(None, &Value::Int(1)), DEFAULT_EQ);
316        assert_eq!(
317            range(None, Some(&Value::Int(1)), None, true, true),
318            DEFAULT_RANGE
319        );
320        assert_eq!(
321            range(None, Some(&Value::Int(1)), Some(&Value::Int(2)), true, true),
322            DEFAULT_BETWEEN
323        );
324        assert_eq!(like_prefix(None, "abc"), DEFAULT_LIKE);
325    }
326
327    #[test]
328    fn equal_in_range_uses_n_distinct() {
329        let s = mk_int_stats(0, 1000, 1000, 0.0);
330        let est = equal(Some(&s), &Value::Int(500));
331        // 1 / 1000 = 0.001
332        assert!((est - 0.001).abs() < 1e-6, "got {est}");
333    }
334
335    #[test]
336    fn equal_out_of_range_extrapolates_down() {
337        let s = mk_int_stats(0, 1000, 1000, 0.0);
338        let est = equal(Some(&s), &Value::Int(5000));
339        // Out-of-range: (1 / 1000) × 0.1 = 0.0001
340        assert!(est < 0.001, "out-of-range must shrink, got {est}");
341        assert!(est >= MIN_SELECTIVITY);
342    }
343
344    #[test]
345    fn range_open_low_open_high_is_full() {
346        let s = mk_int_stats(0, 1000, 1000, 0.0);
347        let est = range(Some(&s), None, None, true, true);
348        // No bounds = full range × (1 - null_frac) = 1.0
349        assert!((est - 1.0).abs() < 1e-6);
350    }
351
352    #[test]
353    fn range_half_range_yields_about_half() {
354        let s = mk_int_stats(0, 1000, 1000, 0.0);
355        let est = range(Some(&s), None, Some(&Value::Int(500)), true, true);
356        assert!((0.4..=0.6).contains(&est), "got {est}");
357    }
358
359    #[test]
360    fn range_inverted_returns_min_selectivity() {
361        let s = mk_int_stats(0, 1000, 1000, 0.0);
362        // low > high → empty result; estimator returns MIN_SELECTIVITY
363        let est = range(
364            Some(&s),
365            Some(&Value::Int(900)),
366            Some(&Value::Int(100)),
367            true,
368            true,
369        );
370        assert!(est < 0.01);
371    }
372
373    #[test]
374    fn between_inclusive_subrange_matches_bucket_share() {
375        let s = mk_int_stats(0, 1000, 1000, 0.0);
376        let est = between(Some(&s), &Value::Int(100), &Value::Int(200));
377        // Approx 10% of the data range.
378        assert!((0.05..=0.15).contains(&est), "got {est}");
379    }
380
381    #[test]
382    fn in_list_sums_and_clamps() {
383        let s = mk_int_stats(0, 100, 100, 0.0);
384        let est = in_list(Some(&s), &[Value::Int(1), Value::Int(2), Value::Int(3)]);
385        // 3 × (1 / 100) = 0.03
386        assert!((est - 0.03).abs() < 1e-6);
387        // Empty list → MIN_SELECTIVITY, never 0.
388        assert!(in_list(Some(&s), &[]) >= MIN_SELECTIVITY);
389    }
390
391    #[test]
392    fn in_list_caps_at_one() {
393        let s = mk_int_stats(0, 5, 5, 0.0);
394        let many: Vec<Value> = (0..50).map(Value::Int).collect();
395        let est = in_list(Some(&s), &many);
396        assert!(est <= 1.0);
397    }
398
399    #[test]
400    fn like_prefix_estimates_range_share() {
401        // Build a TEXT histogram so prefix matching has somewhere
402        // to land. Values "a000" .. "z999" → 101 bounds.
403        let mut bounds = Vec::with_capacity(101);
404        let chars: Vec<char> = ('a'..='z').collect();
405        for i in 0..=100 {
406            let c = chars[i % chars.len()];
407            bounds.push(alloc::format!("{c}{i:03}"));
408        }
409        bounds.sort();
410        let s = ColumnStats {
411            null_frac: 0.0,
412            n_distinct: 1000,
413            histogram_bounds: bounds,
414        };
415        let est_a = like_prefix(Some(&s), "a");
416        let est_z = like_prefix(Some(&s), "z");
417        // Both prefixes should yield positive selectivity less
418        // than 1.
419        assert!((MIN_SELECTIVITY..=1.0).contains(&est_a));
420        assert!((MIN_SELECTIVITY..=1.0).contains(&est_z));
421    }
422
423    #[test]
424    fn null_frac_reduces_selectivity_proportionally() {
425        let s = mk_int_stats(0, 1000, 1000, 0.5);
426        let est = range(Some(&s), None, Some(&Value::Int(500)), true, true);
427        // Half-range × (1 - 0.5) ≈ 0.25
428        assert!((0.20..=0.30).contains(&est), "got {est}");
429    }
430}