#![allow(clippy::cast_precision_loss)]
mod plan_cost;
mod selectivity_method;
use crate::collection::query_cost::cost_model::OperationCostFactors;
use crate::collection::stats::next_after;
use crate::collection::stats::CollectionStats;
use crate::collection::stats::Histogram;
use crate::velesql::ast::{CompareOp, Condition, Value};
pub(super) const COMPAT_FILTER_IO: f64 = 0.2;
pub(super) const COMPAT_FILTER_CPU: f64 = 0.8;
pub(super) const COMPAT_HNSW_IO: f64 = 0.5;
pub(super) const COMPAT_HNSW_CPU: f64 = 1.0;
#[derive(Debug, Clone, Copy, Default, PartialEq)]
pub struct Cost {
pub io_cost: f64,
pub cpu_cost: f64,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum SelectivityMethod {
Histogram,
Cardinality,
Heuristic,
}
impl SelectivityMethod {
#[must_use]
pub const fn as_str(self) -> &'static str {
match self {
Self::Histogram => "histogram",
Self::Cardinality => "cardinality",
Self::Heuristic => "heuristic",
}
}
#[must_use]
pub const fn worst(self, other: Self) -> Self {
match (self, other) {
(Self::Heuristic, _) | (_, Self::Heuristic) => Self::Heuristic,
(Self::Cardinality, _) | (_, Self::Cardinality) => Self::Cardinality,
_ => Self::Histogram,
}
}
}
impl std::fmt::Display for SelectivityMethod {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.as_str())
}
}
impl Cost {
#[must_use]
pub const fn new(io_cost: f64, cpu_cost: f64) -> Self {
Self { io_cost, cpu_cost }
}
#[must_use]
pub const fn total(self) -> f64 {
self.io_cost + self.cpu_cost
}
}
#[derive(Debug)]
enum CostFactorsRef<'a> {
Calibrated(&'a OperationCostFactors),
Default,
}
impl CostFactorsRef<'_> {
fn get(&self) -> &OperationCostFactors {
match self {
Self::Calibrated(f) => f,
Self::Default => {
use std::sync::LazyLock;
static DEFAULT_FACTORS: LazyLock<OperationCostFactors> =
LazyLock::new(OperationCostFactors::default);
&DEFAULT_FACTORS
}
}
}
}
#[derive(Debug)]
pub struct CostEstimator<'a> {
pub(super) stats: &'a CollectionStats,
factors: CostFactorsRef<'a>,
}
pub(super) fn value_to_f64(value: &Value) -> Option<f64> {
match value {
Value::Integer(i) => Some(*i as f64),
Value::UnsignedInteger(u) => Some(*u as f64),
Value::Float(f) => Some(*f),
Value::Boolean(b) => Some(if *b { 1.0 } else { 0.0 }),
_ => None,
}
}
pub(super) fn default_factors() -> &'static OperationCostFactors {
use std::sync::LazyLock;
static DEFAULT: LazyLock<OperationCostFactors> = LazyLock::new(OperationCostFactors::default);
&DEFAULT
}
impl<'a> CostEstimator<'a> {
#[must_use]
pub fn new(stats: &'a CollectionStats) -> Self {
let factors = match &stats.calibrated_cost_factors {
Some(f) => CostFactorsRef::Calibrated(f),
None => CostFactorsRef::Default,
};
Self { stats, factors }
}
#[must_use]
pub fn with_factors(stats: &'a CollectionStats, factors: &'a OperationCostFactors) -> Self {
Self {
stats,
factors: CostFactorsRef::Calibrated(factors),
}
}
pub(super) fn factors(&self) -> &OperationCostFactors {
self.factors.get()
}
pub(super) fn get_histogram(&self, column: &str) -> Option<&Histogram> {
self.stats.get_column_histogram(column)
}
#[must_use]
pub fn estimate_filter_cost(&self, filter: &Condition) -> Cost {
let selectivity = self.estimate_condition_selectivity(filter).clamp(0.0, 1.0);
let total = self.stats.total_points.max(self.stats.row_count) as f64;
let scan_rows = (total * selectivity).max(1.0);
let f = self.factors.get();
let d = default_factors();
let io_ratio = f.seq_page_cost / d.seq_page_cost;
let cpu_ratio = f.cpu_tuple_cost / d.cpu_tuple_cost;
Cost::new(
scan_rows * COMPAT_FILTER_IO * io_ratio,
scan_rows * COMPAT_FILTER_CPU * cpu_ratio,
)
}
#[must_use]
pub fn estimate_hnsw_search_cost(&self, k: usize) -> Cost {
let total = self.stats.total_points.max(self.stats.row_count).max(1) as f64;
let probe = (k.max(1) as f64) * total.log2().max(1.0);
self.hnsw_cost_from_probe(probe)
}
#[must_use]
pub fn estimate_hnsw_search_cost_with_ef(&self, ef_search: u32, candidates: u32) -> Cost {
let total = self.stats.total_points.max(self.stats.row_count).max(1);
self.estimate_hnsw_search_cost_with_ef_on_size(ef_search, candidates, total)
}
#[must_use]
pub fn estimate_hnsw_search_cost_with_ef_on_size(
&self,
ef_search: u32,
candidates: u32,
collection_size: u64,
) -> Cost {
let total = collection_size.max(1) as f64;
let ef = f64::from(ef_search.max(1));
let k = f64::from(candidates.max(1));
let probe = (ef + k) * total.log2().max(1.0);
self.hnsw_cost_from_probe(probe)
}
pub(super) fn hnsw_cost_from_probe(&self, probe: f64) -> Cost {
let f = self.factors.get();
let d = default_factors();
let io_ratio = f.random_page_cost / d.random_page_cost;
let cpu_ratio = f.cpu_distance_cost / d.cpu_distance_cost;
Cost::new(
probe * COMPAT_HNSW_IO * io_ratio,
probe * COMPAT_HNSW_CPU * cpu_ratio,
)
}
#[must_use]
pub fn estimate_condition_selectivity(&self, condition: &Condition) -> f64 {
match condition {
Condition::Comparison(cmp) => self.estimate_comparison_selectivity_with_histogram(
&cmp.column,
cmp.operator,
&cmp.value,
),
Condition::In(cond) => {
self.estimate_in_selectivity(&cond.column, &cond.values, cond.negated)
}
Condition::Between(cond) => {
self.estimate_between_selectivity(&cond.column, &cond.low, &cond.high)
}
Condition::Like(cond) => self.estimate_like_selectivity(&cond.column, &cond.pattern),
Condition::IsNull(cond) => self
.stats
.field_stats
.get(cond.column.as_str())
.map_or(0.1, |s| {
s.null_count as f64 / self.stats.total_points.max(1) as f64
}),
Condition::Match(_) | Condition::Contains(_) | Condition::GeoDistance(_) => 0.1,
Condition::ContainsText(_) => 0.05,
Condition::GeoBbox(_) => 0.2,
Condition::GraphMatch(_) => 0.5,
Condition::And(left, right) => {
self.estimate_condition_selectivity(left)
* self.estimate_condition_selectivity(right)
}
Condition::Or(left, right) => {
let l = self.estimate_condition_selectivity(left);
let r = self.estimate_condition_selectivity(right);
(l + r - (l * r)).clamp(0.0, 1.0)
}
Condition::Not(inner) => 1.0 - self.estimate_condition_selectivity(inner),
Condition::Group(inner) => self.estimate_condition_selectivity(inner),
Condition::VectorSearch(_)
| Condition::VectorFusedSearch(_)
| Condition::SparseVectorSearch(_)
| Condition::Similarity(_) => 1.0,
}
}
pub(super) fn estimate_comparison_selectivity_with_histogram(
&self,
column: &str,
op: CompareOp,
value: &Value,
) -> f64 {
if matches!(value, Value::Parameter(_)) {
return 0.1;
}
let Some(v) = value_to_f64(value) else {
return self.stats.estimate_selectivity(column);
};
let Some(hist) = self.get_histogram(column) else {
return self.stats.estimate_selectivity(column);
};
let sel = match op {
CompareOp::Eq => hist.estimate_eq_selectivity(v),
CompareOp::NotEq => 1.0 - hist.estimate_eq_selectivity(v),
CompareOp::Lt => hist.estimate_lt_selectivity(v),
CompareOp::Lte => hist.estimate_lt_selectivity(next_after(v)),
CompareOp::Gt => 1.0 - hist.estimate_lt_selectivity(next_after(v)),
CompareOp::Gte => 1.0 - hist.estimate_lt_selectivity(v),
};
sel.clamp(0.0, 1.0)
}
pub(super) fn estimate_between_selectivity(
&self,
column: &str,
low: &Value,
high: &Value,
) -> f64 {
let (Some(low_f), Some(high_f)) = (value_to_f64(low), value_to_f64(high)) else {
return 0.3;
};
match self.get_histogram(column) {
Some(h) => h.estimate_range_selectivity(low_f, next_after(high_f)),
None => 0.3,
}
}
pub(super) fn estimate_in_selectivity(
&self,
column: &str,
values: &[Value],
negated: bool,
) -> f64 {
let sel = if let Some(h) = self.get_histogram(column) {
let numeric_sels: Vec<f64> = values
.iter()
.filter_map(value_to_f64)
.map(|v| h.estimate_eq_selectivity(v))
.collect();
if numeric_sels.is_empty() {
let base = self.stats.estimate_selectivity(column);
(base * values.len() as f64).clamp(0.0, 1.0)
} else {
let sum: f64 = numeric_sels.into_iter().sum();
sum.clamp(0.0, 1.0)
}
} else {
let base = self.stats.estimate_selectivity(column);
(base * values.len() as f64).clamp(0.0, 1.0)
};
if negated {
1.0 - sel
} else {
sel
}
}
#[must_use]
pub fn estimate_post_filter_topk_cost(&self, k: u32, ef_search: u32) -> Cost {
let n = f64::from(k.max(ef_search).max(1));
let f = self.factors.get();
let d = default_factors();
let cpu_ratio = f.cpu_tuple_cost / d.cpu_tuple_cost;
Cost::new(0.0, n * d.cpu_tuple_cost * cpu_ratio)
}
#[must_use]
pub fn estimate_filter_cost_from_selectivity(&self, selectivity: f64) -> Cost {
let sel = selectivity.clamp(0.0, 1.0);
let total = self.stats.total_points.max(self.stats.row_count) as f64;
let scan_rows = (total * sel).max(1.0);
let f = self.factors.get();
let d = default_factors();
let io_ratio = f.seq_page_cost / d.seq_page_cost;
let cpu_ratio = f.cpu_tuple_cost / d.cpu_tuple_cost;
Cost::new(
scan_rows * COMPAT_FILTER_IO * io_ratio,
scan_rows * COMPAT_FILTER_CPU * cpu_ratio,
)
}
pub(super) fn estimate_like_selectivity(&self, column: &str, pattern: &str) -> f64 {
let is_prefix = pattern.ends_with('%') && !pattern.starts_with('%');
if !is_prefix {
return 0.05;
}
let Some(_hist) = self.get_histogram(column) else {
return 0.1;
};
let distinct = self
.stats
.column_stats
.get(column)
.or_else(|| self.stats.field_stats.get(column))
.map_or(1, |cs| cs.distinct_count.max(1));
(1.0 / distinct as f64).clamp(0.01, 1.0)
}
}