#![allow(clippy::doc_markdown)]
use alloc::string::ToString;
use spg_storage::Value;
use crate::statistics::ColumnStats;
pub const DEFAULT_EQ: f64 = 0.005;
pub const DEFAULT_RANGE: f64 = 0.333;
pub const DEFAULT_BETWEEN: f64 = 0.005;
pub const DEFAULT_LIKE: f64 = 0.005;
const MIN_SELECTIVITY: f64 = 1.0e-6;
pub fn equal(stats: Option<&ColumnStats>, value: &Value) -> f64 {
let Some(s) = stats else {
return DEFAULT_EQ;
};
if s.histogram_bounds.is_empty() || s.n_distinct == 0 {
return DEFAULT_EQ;
}
let base = (1.0 - f64::from(s.null_frac)) / s.n_distinct as f64;
let in_range = value_in_histogram_range(s, value);
if in_range {
base.max(MIN_SELECTIVITY).min(1.0)
} else {
(base * 0.1).max(MIN_SELECTIVITY)
}
}
pub fn range(
stats: Option<&ColumnStats>,
low: Option<&Value>,
high: Option<&Value>,
_lo_incl: bool,
_hi_incl: bool,
) -> f64 {
let Some(s) = stats else {
return match (low, high) {
(Some(_), Some(_)) => DEFAULT_BETWEEN,
_ => DEFAULT_RANGE,
};
};
if s.histogram_bounds.is_empty() {
return match (low, high) {
(Some(_), Some(_)) => DEFAULT_BETWEEN,
_ => DEFAULT_RANGE,
};
}
let lo_frac = match low {
None => 0.0,
Some(v) => fraction_le_value(s, v),
};
let hi_frac = match high {
None => 1.0,
Some(v) => fraction_le_value(s, v),
};
let raw = (hi_frac - lo_frac).clamp(0.0, 1.0);
(raw * (1.0 - f64::from(s.null_frac))).max(MIN_SELECTIVITY)
}
pub fn between(stats: Option<&ColumnStats>, low: &Value, high: &Value) -> f64 {
range(stats, Some(low), Some(high), true, true)
}
pub fn in_list(stats: Option<&ColumnStats>, values: &[Value]) -> f64 {
if values.is_empty() {
return MIN_SELECTIVITY;
}
let total: f64 = values.iter().map(|v| equal(stats, v)).sum();
total.clamp(MIN_SELECTIVITY, 1.0)
}
pub fn like_prefix(stats: Option<&ColumnStats>, prefix: &str) -> f64 {
let Some(s) = stats else {
return DEFAULT_LIKE;
};
if s.histogram_bounds.is_empty() {
return DEFAULT_LIKE;
}
let low_str = prefix.to_string();
let mut high_str = prefix.to_string();
high_str.push('\u{10FFFF}');
let lo_frac = fraction_le_string(s, &low_str);
let hi_frac = fraction_le_string(s, &high_str);
let raw = (hi_frac - lo_frac).clamp(0.0, 1.0);
(raw * (1.0 - f64::from(s.null_frac))).max(MIN_SELECTIVITY)
}
fn value_in_histogram_range(stats: &ColumnStats, value: &Value) -> bool {
let lo = match stats.histogram_bounds.first() {
Some(s) => s,
None => return false,
};
let hi = stats
.histogram_bounds
.last()
.expect("first present implies last present");
let cmp_lo = value_cmp_str(value, lo);
let cmp_hi = value_cmp_str(value, hi);
matches!(
cmp_lo,
core::cmp::Ordering::Equal | core::cmp::Ordering::Greater
) && matches!(
cmp_hi,
core::cmp::Ordering::Equal | core::cmp::Ordering::Less
)
}
fn fraction_le_value(stats: &ColumnStats, value: &Value) -> f64 {
if stats.histogram_bounds.is_empty() {
return 0.5;
}
let n = stats.histogram_bounds.len();
let num_buckets = (n - 1).max(1) as f64;
let mut lo = 0usize;
let mut hi = n;
while lo < hi {
let mid = (lo + hi) / 2;
match value_cmp_str(value, &stats.histogram_bounds[mid]) {
core::cmp::Ordering::Less => hi = mid,
core::cmp::Ordering::Equal | core::cmp::Ordering::Greater => lo = mid + 1,
}
}
let bound_idx = lo.saturating_sub(1);
(bound_idx as f64 / num_buckets).clamp(0.0, 1.0)
}
fn fraction_le_string(stats: &ColumnStats, key: &str) -> f64 {
if stats.histogram_bounds.is_empty() {
return 0.5;
}
let n = stats.histogram_bounds.len();
let num_buckets = (n - 1).max(1) as f64;
let mut lo = 0usize;
let mut hi = n;
while lo < hi {
let mid = (lo + hi) / 2;
if stats.histogram_bounds[mid].as_str() <= key {
lo = mid + 1;
} else {
hi = mid;
}
}
let bound_idx = lo.saturating_sub(1);
(bound_idx as f64 / num_buckets).clamp(0.0, 1.0)
}
fn value_cmp_str(value: &Value, bound: &str) -> core::cmp::Ordering {
use core::cmp::Ordering;
match value {
Value::SmallInt(n) => bound
.parse::<i64>()
.map_or(Ordering::Equal, |b| i64::from(*n).cmp(&b)),
Value::Int(n) => bound
.parse::<i64>()
.map_or(Ordering::Equal, |b| i64::from(*n).cmp(&b)),
Value::BigInt(n) => bound.parse::<i64>().map_or(Ordering::Equal, |b| n.cmp(&b)),
Value::Float(x) => bound
.parse::<f64>()
.ok()
.and_then(|b| x.partial_cmp(&b))
.unwrap_or(Ordering::Equal),
Value::Text(s) | Value::Json(s) => s.as_str().cmp(bound),
Value::Bool(b) => {
let bs = if *b { "t" } else { "f" };
bs.cmp(bound)
}
Value::Date(_)
| Value::Timestamp(_)
| Value::Interval { .. }
| Value::Numeric { .. }
| Value::Vector(_)
| Value::Sq8Vector(_)
| Value::HalfVector(_) => {
crate::canonical_value_repr(value).as_str().cmp(bound)
}
Value::Null => {
Ordering::Equal
}
_ => crate::canonical_value_repr(value).as_str().cmp(bound),
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::statistics::ColumnStats;
use alloc::string::String;
use alloc::vec::Vec;
fn mk_int_stats(min: i64, max: i64, distinct: u64, nulls: f32) -> ColumnStats {
let n = 100usize;
let mut bounds: Vec<String> = Vec::with_capacity(n + 1);
for i in 0..=n {
let v = min + (max - min) * i as i64 / n as i64;
bounds.push(alloc::format!("{v}"));
}
ColumnStats {
null_frac: nulls,
n_distinct: distinct,
histogram_bounds: bounds,
}
}
#[test]
fn no_stats_returns_pg_defaults() {
assert_eq!(equal(None, &Value::Int(1)), DEFAULT_EQ);
assert_eq!(
range(None, Some(&Value::Int(1)), None, true, true),
DEFAULT_RANGE
);
assert_eq!(
range(None, Some(&Value::Int(1)), Some(&Value::Int(2)), true, true),
DEFAULT_BETWEEN
);
assert_eq!(like_prefix(None, "abc"), DEFAULT_LIKE);
}
#[test]
fn equal_in_range_uses_n_distinct() {
let s = mk_int_stats(0, 1000, 1000, 0.0);
let est = equal(Some(&s), &Value::Int(500));
assert!((est - 0.001).abs() < 1e-6, "got {est}");
}
#[test]
fn equal_out_of_range_extrapolates_down() {
let s = mk_int_stats(0, 1000, 1000, 0.0);
let est = equal(Some(&s), &Value::Int(5000));
assert!(est < 0.001, "out-of-range must shrink, got {est}");
assert!(est >= MIN_SELECTIVITY);
}
#[test]
fn range_open_low_open_high_is_full() {
let s = mk_int_stats(0, 1000, 1000, 0.0);
let est = range(Some(&s), None, None, true, true);
assert!((est - 1.0).abs() < 1e-6);
}
#[test]
fn range_half_range_yields_about_half() {
let s = mk_int_stats(0, 1000, 1000, 0.0);
let est = range(Some(&s), None, Some(&Value::Int(500)), true, true);
assert!((0.4..=0.6).contains(&est), "got {est}");
}
#[test]
fn range_inverted_returns_min_selectivity() {
let s = mk_int_stats(0, 1000, 1000, 0.0);
let est = range(
Some(&s),
Some(&Value::Int(900)),
Some(&Value::Int(100)),
true,
true,
);
assert!(est < 0.01);
}
#[test]
fn between_inclusive_subrange_matches_bucket_share() {
let s = mk_int_stats(0, 1000, 1000, 0.0);
let est = between(Some(&s), &Value::Int(100), &Value::Int(200));
assert!((0.05..=0.15).contains(&est), "got {est}");
}
#[test]
fn in_list_sums_and_clamps() {
let s = mk_int_stats(0, 100, 100, 0.0);
let est = in_list(Some(&s), &[Value::Int(1), Value::Int(2), Value::Int(3)]);
assert!((est - 0.03).abs() < 1e-6);
assert!(in_list(Some(&s), &[]) >= MIN_SELECTIVITY);
}
#[test]
fn in_list_caps_at_one() {
let s = mk_int_stats(0, 5, 5, 0.0);
let many: Vec<Value> = (0..50).map(Value::Int).collect();
let est = in_list(Some(&s), &many);
assert!(est <= 1.0);
}
#[test]
fn like_prefix_estimates_range_share() {
let mut bounds = Vec::with_capacity(101);
let chars: Vec<char> = ('a'..='z').collect();
for i in 0..=100 {
let c = chars[i % chars.len()];
bounds.push(alloc::format!("{c}{i:03}"));
}
bounds.sort();
let s = ColumnStats {
null_frac: 0.0,
n_distinct: 1000,
histogram_bounds: bounds,
};
let est_a = like_prefix(Some(&s), "a");
let est_z = like_prefix(Some(&s), "z");
assert!((MIN_SELECTIVITY..=1.0).contains(&est_a));
assert!((MIN_SELECTIVITY..=1.0).contains(&est_z));
}
#[test]
fn null_frac_reduces_selectivity_proportionally() {
let s = mk_int_stats(0, 1000, 1000, 0.5);
let est = range(Some(&s), None, Some(&Value::Int(500)), true, true);
assert!((0.20..=0.30).contains(&est), "got {est}");
}
}