1#![allow(clippy::doc_markdown)]
5
6use alloc::string::ToString;
32
33use spg_storage::Value;
34
35use crate::statistics::ColumnStats;
36
37pub const DEFAULT_EQ: f64 = 0.005;
40
41pub const DEFAULT_RANGE: f64 = 0.333;
43
44pub const DEFAULT_BETWEEN: f64 = 0.005;
46
47pub const DEFAULT_LIKE: f64 = 0.005;
49
50const MIN_SELECTIVITY: f64 = 1.0e-6;
54
55pub fn equal(stats: Option<&ColumnStats>, value: &Value) -> f64 {
60 let Some(s) = stats else {
61 return DEFAULT_EQ;
62 };
63 if s.histogram_bounds.is_empty() || s.n_distinct == 0 {
64 return DEFAULT_EQ;
65 }
66 let base = (1.0 - f64::from(s.null_frac)) / s.n_distinct as f64;
67 let in_range = value_in_histogram_range(s, value);
68 if in_range {
69 base.max(MIN_SELECTIVITY).min(1.0)
70 } else {
71 (base * 0.1).max(MIN_SELECTIVITY)
74 }
75}
76
77pub fn range(
84 stats: Option<&ColumnStats>,
85 low: Option<&Value>,
86 high: Option<&Value>,
87 _lo_incl: bool,
88 _hi_incl: bool,
89) -> f64 {
90 let Some(s) = stats else {
91 return match (low, high) {
92 (Some(_), Some(_)) => DEFAULT_BETWEEN,
93 _ => DEFAULT_RANGE,
94 };
95 };
96 if s.histogram_bounds.is_empty() {
97 return match (low, high) {
98 (Some(_), Some(_)) => DEFAULT_BETWEEN,
99 _ => DEFAULT_RANGE,
100 };
101 }
102 let lo_frac = match low {
103 None => 0.0,
104 Some(v) => fraction_le_value(s, v),
105 };
106 let hi_frac = match high {
107 None => 1.0,
108 Some(v) => fraction_le_value(s, v),
109 };
110 let raw = (hi_frac - lo_frac).clamp(0.0, 1.0);
111 (raw * (1.0 - f64::from(s.null_frac))).max(MIN_SELECTIVITY)
112}
113
114pub fn between(stats: Option<&ColumnStats>, low: &Value, high: &Value) -> f64 {
118 range(stats, Some(low), Some(high), true, true)
119}
120
121pub fn in_list(stats: Option<&ColumnStats>, values: &[Value]) -> f64 {
125 if values.is_empty() {
126 return MIN_SELECTIVITY;
130 }
131 let total: f64 = values.iter().map(|v| equal(stats, v)).sum();
132 total.clamp(MIN_SELECTIVITY, 1.0)
133}
134
135pub fn like_prefix(stats: Option<&ColumnStats>, prefix: &str) -> f64 {
140 let Some(s) = stats else {
141 return DEFAULT_LIKE;
142 };
143 if s.histogram_bounds.is_empty() {
144 return DEFAULT_LIKE;
145 }
146 let low_str = prefix.to_string();
151 let mut high_str = prefix.to_string();
152 high_str.push('\u{10FFFF}');
153 let lo_frac = fraction_le_string(s, &low_str);
154 let hi_frac = fraction_le_string(s, &high_str);
155 let raw = (hi_frac - lo_frac).clamp(0.0, 1.0);
156 (raw * (1.0 - f64::from(s.null_frac))).max(MIN_SELECTIVITY)
157}
158
159fn value_in_histogram_range(stats: &ColumnStats, value: &Value) -> bool {
164 let lo = match stats.histogram_bounds.first() {
165 Some(s) => s,
166 None => return false,
167 };
168 let hi = stats
169 .histogram_bounds
170 .last()
171 .expect("first present implies last present");
172 let cmp_lo = value_cmp_str(value, lo);
173 let cmp_hi = value_cmp_str(value, hi);
174 matches!(cmp_lo, core::cmp::Ordering::Equal | core::cmp::Ordering::Greater)
175 && matches!(cmp_hi, core::cmp::Ordering::Equal | core::cmp::Ordering::Less)
176}
177
178fn fraction_le_value(stats: &ColumnStats, value: &Value) -> f64 {
182 if stats.histogram_bounds.is_empty() {
183 return 0.5;
184 }
185 let n = stats.histogram_bounds.len();
186 let num_buckets = (n - 1).max(1) as f64;
187 let mut lo = 0usize;
188 let mut hi = n;
189 while lo < hi {
190 let mid = (lo + hi) / 2;
191 match value_cmp_str(value, &stats.histogram_bounds[mid]) {
192 core::cmp::Ordering::Less => hi = mid,
193 core::cmp::Ordering::Equal | core::cmp::Ordering::Greater => lo = mid + 1,
194 }
195 }
196 let bound_idx = lo.saturating_sub(1);
201 (bound_idx as f64 / num_buckets).clamp(0.0, 1.0)
202}
203
204fn fraction_le_string(stats: &ColumnStats, key: &str) -> f64 {
208 if stats.histogram_bounds.is_empty() {
209 return 0.5;
210 }
211 let n = stats.histogram_bounds.len();
212 let num_buckets = (n - 1).max(1) as f64;
213 let mut lo = 0usize;
214 let mut hi = n;
215 while lo < hi {
216 let mid = (lo + hi) / 2;
217 if stats.histogram_bounds[mid].as_str() <= key {
218 lo = mid + 1;
219 } else {
220 hi = mid;
221 }
222 }
223 let bound_idx = lo.saturating_sub(1);
224 (bound_idx as f64 / num_buckets).clamp(0.0, 1.0)
225}
226
227fn value_cmp_str(value: &Value, bound: &str) -> core::cmp::Ordering {
235 use core::cmp::Ordering;
236 match value {
237 Value::SmallInt(n) => bound
238 .parse::<i64>()
239 .map_or(Ordering::Equal, |b| i64::from(*n).cmp(&b)),
240 Value::Int(n) => bound
241 .parse::<i64>()
242 .map_or(Ordering::Equal, |b| i64::from(*n).cmp(&b)),
243 Value::BigInt(n) => bound
244 .parse::<i64>()
245 .map_or(Ordering::Equal, |b| n.cmp(&b)),
246 Value::Float(x) => bound
247 .parse::<f64>()
248 .ok()
249 .and_then(|b| x.partial_cmp(&b))
250 .unwrap_or(Ordering::Equal),
251 Value::Text(s) | Value::Json(s) => s.as_str().cmp(bound),
252 Value::Bool(b) => {
253 let bs = if *b { "t" } else { "f" };
254 bs.cmp(bound)
255 }
256 Value::Date(_)
261 | Value::Timestamp(_)
262 | Value::Interval { .. }
263 | Value::Numeric { .. }
264 | Value::Vector(_)
265 | Value::Sq8Vector(_)
266 | Value::HalfVector(_) => {
267 crate::canonical_value_repr(value).as_str().cmp(bound)
271 }
272 Value::Null => {
273 Ordering::Equal
277 }
278 _ => crate::canonical_value_repr(value).as_str().cmp(bound),
281 }
282}
283
284#[cfg(test)]
287mod tests {
288 use super::*;
289 use crate::statistics::ColumnStats;
290 use alloc::string::String;
291 use alloc::vec::Vec;
292
293 fn mk_int_stats(min: i64, max: i64, distinct: u64, nulls: f32) -> ColumnStats {
294 let n = 100usize;
298 let mut bounds: Vec<String> = Vec::with_capacity(n + 1);
299 for i in 0..=n {
300 let v = min + (max - min) * i as i64 / n as i64;
301 bounds.push(alloc::format!("{v}"));
302 }
303 ColumnStats {
304 null_frac: nulls,
305 n_distinct: distinct,
306 histogram_bounds: bounds,
307 }
308 }
309
310 #[test]
311 fn no_stats_returns_pg_defaults() {
312 assert_eq!(equal(None, &Value::Int(1)), DEFAULT_EQ);
313 assert_eq!(range(None, Some(&Value::Int(1)), None, true, true), DEFAULT_RANGE);
314 assert_eq!(
315 range(None, Some(&Value::Int(1)), Some(&Value::Int(2)), true, true),
316 DEFAULT_BETWEEN
317 );
318 assert_eq!(like_prefix(None, "abc"), DEFAULT_LIKE);
319 }
320
321 #[test]
322 fn equal_in_range_uses_n_distinct() {
323 let s = mk_int_stats(0, 1000, 1000, 0.0);
324 let est = equal(Some(&s), &Value::Int(500));
325 assert!((est - 0.001).abs() < 1e-6, "got {est}");
327 }
328
329 #[test]
330 fn equal_out_of_range_extrapolates_down() {
331 let s = mk_int_stats(0, 1000, 1000, 0.0);
332 let est = equal(Some(&s), &Value::Int(5000));
333 assert!(est < 0.001, "out-of-range must shrink, got {est}");
335 assert!(est >= MIN_SELECTIVITY);
336 }
337
338 #[test]
339 fn range_open_low_open_high_is_full() {
340 let s = mk_int_stats(0, 1000, 1000, 0.0);
341 let est = range(Some(&s), None, None, true, true);
342 assert!((est - 1.0).abs() < 1e-6);
344 }
345
346 #[test]
347 fn range_half_range_yields_about_half() {
348 let s = mk_int_stats(0, 1000, 1000, 0.0);
349 let est = range(Some(&s), None, Some(&Value::Int(500)), true, true);
350 assert!((0.4..=0.6).contains(&est), "got {est}");
351 }
352
353 #[test]
354 fn range_inverted_returns_min_selectivity() {
355 let s = mk_int_stats(0, 1000, 1000, 0.0);
356 let est = range(
358 Some(&s),
359 Some(&Value::Int(900)),
360 Some(&Value::Int(100)),
361 true,
362 true,
363 );
364 assert!(est < 0.01);
365 }
366
367 #[test]
368 fn between_inclusive_subrange_matches_bucket_share() {
369 let s = mk_int_stats(0, 1000, 1000, 0.0);
370 let est = between(Some(&s), &Value::Int(100), &Value::Int(200));
371 assert!((0.05..=0.15).contains(&est), "got {est}");
373 }
374
375 #[test]
376 fn in_list_sums_and_clamps() {
377 let s = mk_int_stats(0, 100, 100, 0.0);
378 let est = in_list(
379 Some(&s),
380 &[Value::Int(1), Value::Int(2), Value::Int(3)],
381 );
382 assert!((est - 0.03).abs() < 1e-6);
384 assert!(in_list(Some(&s), &[]) >= MIN_SELECTIVITY);
386 }
387
388 #[test]
389 fn in_list_caps_at_one() {
390 let s = mk_int_stats(0, 5, 5, 0.0);
391 let many: Vec<Value> = (0..50).map(Value::Int).collect();
392 let est = in_list(Some(&s), &many);
393 assert!(est <= 1.0);
394 }
395
396 #[test]
397 fn like_prefix_estimates_range_share() {
398 let mut bounds = Vec::with_capacity(101);
401 let chars: Vec<char> = ('a'..='z').collect();
402 for i in 0..=100 {
403 let c = chars[i % chars.len()];
404 bounds.push(alloc::format!("{c}{i:03}"));
405 }
406 bounds.sort();
407 let s = ColumnStats {
408 null_frac: 0.0,
409 n_distinct: 1000,
410 histogram_bounds: bounds,
411 };
412 let est_a = like_prefix(Some(&s), "a");
413 let est_z = like_prefix(Some(&s), "z");
414 assert!((MIN_SELECTIVITY..=1.0).contains(&est_a));
417 assert!((MIN_SELECTIVITY..=1.0).contains(&est_z));
418 }
419
420 #[test]
421 fn null_frac_reduces_selectivity_proportionally() {
422 let s = mk_int_stats(0, 1000, 1000, 0.5);
423 let est = range(Some(&s), None, Some(&Value::Int(500)), true, true);
424 assert!((0.20..=0.30).contains(&est), "got {est}");
426 }
427}