1#![allow(clippy::doc_markdown)]
5
6use alloc::string::ToString;
32
33use spg_storage::Value;
34
35use crate::statistics::ColumnStats;
36
37pub const DEFAULT_EQ: f64 = 0.005;
40
41pub const DEFAULT_RANGE: f64 = 0.333;
43
44pub const DEFAULT_BETWEEN: f64 = 0.005;
46
47pub const DEFAULT_LIKE: f64 = 0.005;
49
50const MIN_SELECTIVITY: f64 = 1.0e-6;
54
55pub fn equal(stats: Option<&ColumnStats>, value: &Value) -> f64 {
60 let Some(s) = stats else {
61 return DEFAULT_EQ;
62 };
63 if s.histogram_bounds.is_empty() || s.n_distinct == 0 {
64 return DEFAULT_EQ;
65 }
66 let base = (1.0 - f64::from(s.null_frac)) / s.n_distinct as f64;
67 let in_range = value_in_histogram_range(s, value);
68 if in_range {
69 base.max(MIN_SELECTIVITY).min(1.0)
70 } else {
71 (base * 0.1).max(MIN_SELECTIVITY)
74 }
75}
76
77pub fn range(
84 stats: Option<&ColumnStats>,
85 low: Option<&Value>,
86 high: Option<&Value>,
87 _lo_incl: bool,
88 _hi_incl: bool,
89) -> f64 {
90 let Some(s) = stats else {
91 return match (low, high) {
92 (Some(_), Some(_)) => DEFAULT_BETWEEN,
93 _ => DEFAULT_RANGE,
94 };
95 };
96 if s.histogram_bounds.is_empty() {
97 return match (low, high) {
98 (Some(_), Some(_)) => DEFAULT_BETWEEN,
99 _ => DEFAULT_RANGE,
100 };
101 }
102 let lo_frac = match low {
103 None => 0.0,
104 Some(v) => fraction_le_value(s, v),
105 };
106 let hi_frac = match high {
107 None => 1.0,
108 Some(v) => fraction_le_value(s, v),
109 };
110 let raw = (hi_frac - lo_frac).clamp(0.0, 1.0);
111 (raw * (1.0 - f64::from(s.null_frac))).max(MIN_SELECTIVITY)
112}
113
114pub fn between(stats: Option<&ColumnStats>, low: &Value, high: &Value) -> f64 {
118 range(stats, Some(low), Some(high), true, true)
119}
120
121pub fn in_list(stats: Option<&ColumnStats>, values: &[Value]) -> f64 {
125 if values.is_empty() {
126 return MIN_SELECTIVITY;
130 }
131 let total: f64 = values.iter().map(|v| equal(stats, v)).sum();
132 total.clamp(MIN_SELECTIVITY, 1.0)
133}
134
135pub fn like_prefix(stats: Option<&ColumnStats>, prefix: &str) -> f64 {
140 let Some(s) = stats else {
141 return DEFAULT_LIKE;
142 };
143 if s.histogram_bounds.is_empty() {
144 return DEFAULT_LIKE;
145 }
146 let low_str = prefix.to_string();
151 let mut high_str = prefix.to_string();
152 high_str.push('\u{10FFFF}');
153 let lo_frac = fraction_le_string(s, &low_str);
154 let hi_frac = fraction_le_string(s, &high_str);
155 let raw = (hi_frac - lo_frac).clamp(0.0, 1.0);
156 (raw * (1.0 - f64::from(s.null_frac))).max(MIN_SELECTIVITY)
157}
158
159fn value_in_histogram_range(stats: &ColumnStats, value: &Value) -> bool {
164 let lo = match stats.histogram_bounds.first() {
165 Some(s) => s,
166 None => return false,
167 };
168 let hi = stats
169 .histogram_bounds
170 .last()
171 .expect("first present implies last present");
172 let cmp_lo = value_cmp_str(value, lo);
173 let cmp_hi = value_cmp_str(value, hi);
174 matches!(
175 cmp_lo,
176 core::cmp::Ordering::Equal | core::cmp::Ordering::Greater
177 ) && matches!(
178 cmp_hi,
179 core::cmp::Ordering::Equal | core::cmp::Ordering::Less
180 )
181}
182
183fn fraction_le_value(stats: &ColumnStats, value: &Value) -> f64 {
187 if stats.histogram_bounds.is_empty() {
188 return 0.5;
189 }
190 let n = stats.histogram_bounds.len();
191 let num_buckets = (n - 1).max(1) as f64;
192 let mut lo = 0usize;
193 let mut hi = n;
194 while lo < hi {
195 let mid = (lo + hi) / 2;
196 match value_cmp_str(value, &stats.histogram_bounds[mid]) {
197 core::cmp::Ordering::Less => hi = mid,
198 core::cmp::Ordering::Equal | core::cmp::Ordering::Greater => lo = mid + 1,
199 }
200 }
201 let bound_idx = lo.saturating_sub(1);
206 (bound_idx as f64 / num_buckets).clamp(0.0, 1.0)
207}
208
209fn fraction_le_string(stats: &ColumnStats, key: &str) -> f64 {
213 if stats.histogram_bounds.is_empty() {
214 return 0.5;
215 }
216 let n = stats.histogram_bounds.len();
217 let num_buckets = (n - 1).max(1) as f64;
218 let mut lo = 0usize;
219 let mut hi = n;
220 while lo < hi {
221 let mid = (lo + hi) / 2;
222 if stats.histogram_bounds[mid].as_str() <= key {
223 lo = mid + 1;
224 } else {
225 hi = mid;
226 }
227 }
228 let bound_idx = lo.saturating_sub(1);
229 (bound_idx as f64 / num_buckets).clamp(0.0, 1.0)
230}
231
232fn value_cmp_str(value: &Value, bound: &str) -> core::cmp::Ordering {
240 use core::cmp::Ordering;
241 match value {
242 Value::SmallInt(n) => bound
243 .parse::<i64>()
244 .map_or(Ordering::Equal, |b| i64::from(*n).cmp(&b)),
245 Value::Int(n) => bound
246 .parse::<i64>()
247 .map_or(Ordering::Equal, |b| i64::from(*n).cmp(&b)),
248 Value::BigInt(n) => bound.parse::<i64>().map_or(Ordering::Equal, |b| n.cmp(&b)),
249 Value::Float(x) => bound
250 .parse::<f64>()
251 .ok()
252 .and_then(|b| x.partial_cmp(&b))
253 .unwrap_or(Ordering::Equal),
254 Value::Text(s) | Value::Json(s) => s.as_str().cmp(bound),
255 Value::Bool(b) => {
256 let bs = if *b { "t" } else { "f" };
257 bs.cmp(bound)
258 }
259 Value::Date(_)
264 | Value::Timestamp(_)
265 | Value::Interval { .. }
266 | Value::Numeric { .. }
267 | Value::Vector(_)
268 | Value::Sq8Vector(_)
269 | Value::HalfVector(_) => {
270 crate::canonical_value_repr(value).as_str().cmp(bound)
274 }
275 Value::Null => {
276 Ordering::Equal
280 }
281 _ => crate::canonical_value_repr(value).as_str().cmp(bound),
284 }
285}
286
287#[cfg(test)]
290mod tests {
291 use super::*;
292 use crate::statistics::ColumnStats;
293 use alloc::string::String;
294 use alloc::vec::Vec;
295
296 fn mk_int_stats(min: i64, max: i64, distinct: u64, nulls: f32) -> ColumnStats {
297 let n = 100usize;
301 let mut bounds: Vec<String> = Vec::with_capacity(n + 1);
302 for i in 0..=n {
303 let v = min + (max - min) * i as i64 / n as i64;
304 bounds.push(alloc::format!("{v}"));
305 }
306 ColumnStats {
307 null_frac: nulls,
308 n_distinct: distinct,
309 histogram_bounds: bounds,
310 }
311 }
312
313 #[test]
314 fn no_stats_returns_pg_defaults() {
315 assert_eq!(equal(None, &Value::Int(1)), DEFAULT_EQ);
316 assert_eq!(
317 range(None, Some(&Value::Int(1)), None, true, true),
318 DEFAULT_RANGE
319 );
320 assert_eq!(
321 range(None, Some(&Value::Int(1)), Some(&Value::Int(2)), true, true),
322 DEFAULT_BETWEEN
323 );
324 assert_eq!(like_prefix(None, "abc"), DEFAULT_LIKE);
325 }
326
327 #[test]
328 fn equal_in_range_uses_n_distinct() {
329 let s = mk_int_stats(0, 1000, 1000, 0.0);
330 let est = equal(Some(&s), &Value::Int(500));
331 assert!((est - 0.001).abs() < 1e-6, "got {est}");
333 }
334
335 #[test]
336 fn equal_out_of_range_extrapolates_down() {
337 let s = mk_int_stats(0, 1000, 1000, 0.0);
338 let est = equal(Some(&s), &Value::Int(5000));
339 assert!(est < 0.001, "out-of-range must shrink, got {est}");
341 assert!(est >= MIN_SELECTIVITY);
342 }
343
344 #[test]
345 fn range_open_low_open_high_is_full() {
346 let s = mk_int_stats(0, 1000, 1000, 0.0);
347 let est = range(Some(&s), None, None, true, true);
348 assert!((est - 1.0).abs() < 1e-6);
350 }
351
352 #[test]
353 fn range_half_range_yields_about_half() {
354 let s = mk_int_stats(0, 1000, 1000, 0.0);
355 let est = range(Some(&s), None, Some(&Value::Int(500)), true, true);
356 assert!((0.4..=0.6).contains(&est), "got {est}");
357 }
358
359 #[test]
360 fn range_inverted_returns_min_selectivity() {
361 let s = mk_int_stats(0, 1000, 1000, 0.0);
362 let est = range(
364 Some(&s),
365 Some(&Value::Int(900)),
366 Some(&Value::Int(100)),
367 true,
368 true,
369 );
370 assert!(est < 0.01);
371 }
372
373 #[test]
374 fn between_inclusive_subrange_matches_bucket_share() {
375 let s = mk_int_stats(0, 1000, 1000, 0.0);
376 let est = between(Some(&s), &Value::Int(100), &Value::Int(200));
377 assert!((0.05..=0.15).contains(&est), "got {est}");
379 }
380
381 #[test]
382 fn in_list_sums_and_clamps() {
383 let s = mk_int_stats(0, 100, 100, 0.0);
384 let est = in_list(Some(&s), &[Value::Int(1), Value::Int(2), Value::Int(3)]);
385 assert!((est - 0.03).abs() < 1e-6);
387 assert!(in_list(Some(&s), &[]) >= MIN_SELECTIVITY);
389 }
390
391 #[test]
392 fn in_list_caps_at_one() {
393 let s = mk_int_stats(0, 5, 5, 0.0);
394 let many: Vec<Value> = (0..50).map(Value::Int).collect();
395 let est = in_list(Some(&s), &many);
396 assert!(est <= 1.0);
397 }
398
399 #[test]
400 fn like_prefix_estimates_range_share() {
401 let mut bounds = Vec::with_capacity(101);
404 let chars: Vec<char> = ('a'..='z').collect();
405 for i in 0..=100 {
406 let c = chars[i % chars.len()];
407 bounds.push(alloc::format!("{c}{i:03}"));
408 }
409 bounds.sort();
410 let s = ColumnStats {
411 null_frac: 0.0,
412 n_distinct: 1000,
413 histogram_bounds: bounds,
414 };
415 let est_a = like_prefix(Some(&s), "a");
416 let est_z = like_prefix(Some(&s), "z");
417 assert!((MIN_SELECTIVITY..=1.0).contains(&est_a));
420 assert!((MIN_SELECTIVITY..=1.0).contains(&est_z));
421 }
422
423 #[test]
424 fn null_frac_reduces_selectivity_proportionally() {
425 let s = mk_int_stats(0, 1000, 1000, 0.5);
426 let est = range(Some(&s), None, Some(&Value::Int(500)), true, true);
427 assert!((0.20..=0.30).contains(&est), "got {est}");
429 }
430}