use super::stats::TableStats;
use std::cmp::Ordering;
use std::fmt::Debug;
#[derive(Debug, Clone)]
pub enum FilterExpr {
Eq { column: String, value: FilterValue },
Ne { column: String, value: FilterValue },
Lt { column: String, value: FilterValue },
Le { column: String, value: FilterValue },
Gt { column: String, value: FilterValue },
Ge { column: String, value: FilterValue },
Between {
column: String,
lower: FilterValue,
upper: FilterValue,
},
In {
column: String,
values: Vec<FilterValue>,
},
Like { column: String, pattern: String },
IsNull { column: String },
IsNotNull { column: String },
And(Box<FilterExpr>, Box<FilterExpr>),
Or(Box<FilterExpr>, Box<FilterExpr>),
Not(Box<FilterExpr>),
Function { name: String, args: Vec<FilterExpr> },
Subquery { id: u64 },
True,
False,
}
#[derive(Debug, Clone, PartialEq)]
pub enum FilterValue {
Null,
Bool(bool),
Int(i64),
Float(f64),
String(String),
Bytes(Vec<u8>),
}
impl FilterValue {
pub fn as_f64(&self) -> Option<f64> {
match self {
FilterValue::Int(i) => Some(*i as f64),
FilterValue::Float(f) => Some(*f),
_ => None,
}
}
}
#[derive(Debug, Clone)]
pub struct CostModel {
pub base_compare: f64,
pub string_per_byte: f64,
pub function_call: f64,
pub subquery: f64,
pub pattern_match: f64,
pub null_check: f64,
pub in_per_element: f64,
}
impl Default for CostModel {
fn default() -> Self {
Self {
base_compare: 1.0,
string_per_byte: 0.01,
function_call: 10.0,
subquery: 1000.0,
pattern_match: 5.0,
null_check: 0.5,
in_per_element: 1.0,
}
}
}
#[derive(Debug, Clone)]
pub struct SelectivityModel {
pub default_equality: f64,
pub default_inequality: f64,
pub default_range: f64,
pub default_like: f64,
pub default_is_null: f64,
}
impl Default for SelectivityModel {
fn default() -> Self {
Self {
default_equality: 0.01, default_inequality: 0.99, default_range: 0.25, default_like: 0.10, default_is_null: 0.05, }
}
}
#[derive(Debug, Clone)]
pub struct RankingConfig {
pub cost_model: CostModel,
pub selectivity_model: SelectivityModel,
pub use_statistics: bool,
pub min_selectivity: f64,
pub max_cost: f64,
}
impl Default for RankingConfig {
fn default() -> Self {
Self {
cost_model: CostModel::default(),
selectivity_model: SelectivityModel::default(),
use_statistics: true,
min_selectivity: 0.0001,
max_cost: 10000.0,
}
}
}
#[derive(Debug, Clone)]
pub struct RankedFilter {
pub filter: FilterExpr,
pub selectivity: f64,
pub cost: f64,
pub score: f64,
pub original_index: usize,
}
impl RankedFilter {
pub fn new(filter: FilterExpr, selectivity: f64, cost: f64, original_index: usize) -> Self {
let score = if cost > 0.0 {
selectivity / cost
} else {
selectivity
};
Self {
filter,
selectivity,
cost,
score,
original_index,
}
}
}
impl PartialEq for RankedFilter {
fn eq(&self, other: &Self) -> bool {
self.score == other.score
}
}
impl Eq for RankedFilter {}
impl PartialOrd for RankedFilter {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for RankedFilter {
fn cmp(&self, other: &Self) -> Ordering {
self.score
.partial_cmp(&other.score)
.unwrap_or(Ordering::Equal)
}
}
pub struct FilterRanker {
config: RankingConfig,
table_stats: Option<TableStats>,
}
impl FilterRanker {
pub fn new(config: RankingConfig) -> Self {
Self {
config,
table_stats: None,
}
}
pub fn with_defaults() -> Self {
Self::new(RankingConfig::default())
}
pub fn with_stats(mut self, stats: TableStats) -> Self {
self.table_stats = Some(stats);
self
}
pub fn estimate_selectivity(&self, filter: &FilterExpr) -> f64 {
let sel = match filter {
FilterExpr::Eq { column, value } => self.estimate_equality_selectivity(column, value),
FilterExpr::Ne { column, value } => {
1.0 - self.estimate_equality_selectivity(column, value)
}
FilterExpr::Lt { column, value } | FilterExpr::Le { column, value } => {
self.estimate_range_selectivity(column, None, Some(value))
}
FilterExpr::Gt { column, value } | FilterExpr::Ge { column, value } => {
self.estimate_range_selectivity(column, Some(value), None)
}
FilterExpr::Between {
column,
lower,
upper,
} => self.estimate_range_selectivity(column, Some(lower), Some(upper)),
FilterExpr::In { column, values } => self.estimate_in_selectivity(column, values),
FilterExpr::Like { column, pattern } => self.estimate_like_selectivity(column, pattern),
FilterExpr::IsNull { column } => self.estimate_null_selectivity(column, true),
FilterExpr::IsNotNull { column } => self.estimate_null_selectivity(column, false),
FilterExpr::And(left, right) => {
self.estimate_selectivity(left) * self.estimate_selectivity(right)
}
FilterExpr::Or(left, right) => {
let sel_a = self.estimate_selectivity(left);
let sel_b = self.estimate_selectivity(right);
sel_a + sel_b - (sel_a * sel_b)
}
FilterExpr::Not(inner) => 1.0 - self.estimate_selectivity(inner),
FilterExpr::Function { .. } => self.config.selectivity_model.default_equality,
FilterExpr::Subquery { .. } => self.config.selectivity_model.default_equality,
FilterExpr::True => 1.0,
FilterExpr::False => 0.0,
};
sel.max(self.config.min_selectivity)
}
pub fn estimate_cost(&self, filter: &FilterExpr) -> f64 {
let cost = match filter {
FilterExpr::Eq { value, .. } | FilterExpr::Ne { value, .. } => {
self.value_compare_cost(value)
}
FilterExpr::Lt { value, .. }
| FilterExpr::Le { value, .. }
| FilterExpr::Gt { value, .. }
| FilterExpr::Ge { value, .. } => self.value_compare_cost(value),
FilterExpr::Between { lower, upper, .. } => {
self.value_compare_cost(lower) + self.value_compare_cost(upper)
}
FilterExpr::In { values, .. } => {
self.config.cost_model.in_per_element * values.len() as f64
}
FilterExpr::Like { pattern, .. } => {
self.config.cost_model.pattern_match
+ self.config.cost_model.string_per_byte * pattern.len() as f64
}
FilterExpr::IsNull { .. } | FilterExpr::IsNotNull { .. } => {
self.config.cost_model.null_check
}
FilterExpr::And(left, right) | FilterExpr::Or(left, right) => {
self.estimate_cost(left) + self.estimate_cost(right)
}
FilterExpr::Not(inner) => self.estimate_cost(inner),
FilterExpr::Function { args, .. } => {
let arg_cost: f64 = args.iter().map(|a| self.estimate_cost(a)).sum();
self.config.cost_model.function_call + arg_cost
}
FilterExpr::Subquery { .. } => self.config.cost_model.subquery,
FilterExpr::True | FilterExpr::False => 0.0,
};
cost.min(self.config.max_cost)
}
pub fn rank(&self, filters: Vec<FilterExpr>) -> Vec<RankedFilter> {
let mut ranked: Vec<RankedFilter> = filters
.into_iter()
.enumerate()
.map(|(i, f)| {
let selectivity = self.estimate_selectivity(&f);
let cost = self.estimate_cost(&f);
RankedFilter::new(f, selectivity, cost, i)
})
.collect();
ranked.sort();
ranked
}
pub fn reorder(&self, filters: Vec<FilterExpr>) -> Vec<FilterExpr> {
self.rank(filters).into_iter().map(|r| r.filter).collect()
}
fn estimate_equality_selectivity(&self, column: &str, _value: &FilterValue) -> f64 {
if self.config.use_statistics {
if let Some(ref stats) = self.table_stats {
if let Some(col_stats) = stats.get_column(column) {
if col_stats.ndv > 0 {
return 1.0 / col_stats.ndv as f64;
}
}
}
}
self.config.selectivity_model.default_equality
}
fn estimate_range_selectivity(
&self,
column: &str,
lower: Option<&FilterValue>,
upper: Option<&FilterValue>,
) -> f64 {
if self.config.use_statistics {
if let Some(ref stats) = self.table_stats {
if let Some(col_stats) = stats.get_column(column) {
if let (Some(min), Some(max)) = (col_stats.min_value, col_stats.max_value) {
if max > min {
let range = max - min;
let lower_bound = lower.and_then(|v| v.as_f64()).unwrap_or(min);
let upper_bound = upper.and_then(|v| v.as_f64()).unwrap_or(max);
let fraction = (upper_bound - lower_bound) / range;
return fraction.clamp(0.0, 1.0);
}
}
}
}
}
self.config.selectivity_model.default_range
}
fn estimate_in_selectivity(&self, column: &str, values: &[FilterValue]) -> f64 {
let single_sel = self.estimate_equality_selectivity(column, &values[0]);
(single_sel * values.len() as f64).min(1.0)
}
fn estimate_like_selectivity(&self, _column: &str, pattern: &str) -> f64 {
if !pattern.starts_with('%') && pattern.ends_with('%') {
self.config.selectivity_model.default_like * 0.5
} else if pattern.starts_with('%') && pattern.ends_with('%') {
self.config.selectivity_model.default_like * 2.0
} else {
self.config.selectivity_model.default_like
}
}
fn estimate_null_selectivity(&self, column: &str, is_null: bool) -> f64 {
if self.config.use_statistics {
if let Some(ref stats) = self.table_stats {
if let Some(col_stats) = stats.get_column(column) {
let null_fraction = col_stats.null_fraction;
return if is_null {
null_fraction
} else {
1.0 - null_fraction
};
}
}
}
if is_null {
self.config.selectivity_model.default_is_null
} else {
1.0 - self.config.selectivity_model.default_is_null
}
}
fn value_compare_cost(&self, value: &FilterValue) -> f64 {
match value {
FilterValue::String(s) => {
self.config.cost_model.base_compare
+ self.config.cost_model.string_per_byte * s.len() as f64
}
FilterValue::Bytes(b) => {
self.config.cost_model.base_compare
+ self.config.cost_model.string_per_byte * b.len() as f64
}
_ => self.config.cost_model.base_compare,
}
}
pub fn config(&self) -> &RankingConfig {
&self.config
}
}
impl Default for FilterRanker {
fn default() -> Self {
Self::with_defaults()
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::optimizer::stats::ColumnStats;
#[test]
fn test_simple_ranking() {
let ranker = FilterRanker::with_defaults();
let filters = vec![
FilterExpr::Eq {
column: "id".to_string(),
value: FilterValue::Int(42),
},
FilterExpr::Like {
column: "name".to_string(),
pattern: "%test%".to_string(),
},
FilterExpr::IsNull {
column: "deleted_at".to_string(),
},
];
let ranked = ranker.rank(filters);
assert!(ranked[0].score < ranked[1].score);
assert!(ranked[1].score < ranked[2].score);
assert!(matches!(ranked[0].filter, FilterExpr::Eq { .. }));
}
#[test]
fn test_selectivity_estimation() {
let ranker = FilterRanker::with_defaults();
let eq_sel = ranker.estimate_selectivity(&FilterExpr::Eq {
column: "x".to_string(),
value: FilterValue::Int(1),
});
let ne_sel = ranker.estimate_selectivity(&FilterExpr::Ne {
column: "x".to_string(),
value: FilterValue::Int(1),
});
assert!((eq_sel + ne_sel - 1.0).abs() < 0.0001);
let and_sel = ranker.estimate_selectivity(&FilterExpr::And(
Box::new(FilterExpr::Eq {
column: "x".to_string(),
value: FilterValue::Int(1),
}),
Box::new(FilterExpr::Eq {
column: "y".to_string(),
value: FilterValue::Int(2),
}),
));
assert!((and_sel - eq_sel * eq_sel).abs() < 0.0001);
}
#[test]
fn test_cost_estimation() {
let ranker = FilterRanker::with_defaults();
let null_cost = ranker.estimate_cost(&FilterExpr::IsNull {
column: "x".to_string(),
});
let string_cost = ranker.estimate_cost(&FilterExpr::Eq {
column: "x".to_string(),
value: FilterValue::String("a long string value".to_string()),
});
let subquery_cost = ranker.estimate_cost(&FilterExpr::Subquery { id: 1 });
assert!(null_cost < string_cost);
assert!(string_cost < subquery_cost);
}
#[test]
fn test_with_statistics() {
let mut stats = TableStats::new("test".to_string(), 10000);
stats.add_column(ColumnStats {
name: "status".to_string(),
ndv: 5, null_fraction: 0.0,
min_value: None,
max_value: None,
});
let ranker = FilterRanker::with_defaults().with_stats(stats);
let sel = ranker.estimate_selectivity(&FilterExpr::Eq {
column: "status".to_string(),
value: FilterValue::String("active".to_string()),
});
assert!((sel - 0.2).abs() < 0.01);
}
#[test]
fn test_reorder() {
let ranker = FilterRanker::with_defaults();
let filters = vec![
FilterExpr::Subquery { id: 1 }, FilterExpr::IsNull {
column: "x".to_string(),
}, FilterExpr::Function {
name: "expensive_fn".to_string(),
args: vec![],
}, ];
let reordered = ranker.reorder(filters);
assert!(matches!(reordered[0], FilterExpr::Subquery { .. }));
assert!(matches!(reordered[1], FilterExpr::Function { .. }));
assert!(matches!(reordered[2], FilterExpr::IsNull { .. }));
}
#[test]
fn test_in_selectivity() {
let ranker = FilterRanker::with_defaults();
let in_1 = ranker.estimate_selectivity(&FilterExpr::In {
column: "x".to_string(),
values: vec![FilterValue::Int(1)],
});
let in_5 = ranker.estimate_selectivity(&FilterExpr::In {
column: "x".to_string(),
values: vec![
FilterValue::Int(1),
FilterValue::Int(2),
FilterValue::Int(3),
FilterValue::Int(4),
FilterValue::Int(5),
],
});
assert!(in_5 > in_1);
}
#[test]
fn test_like_patterns() {
let ranker = FilterRanker::with_defaults();
let prefix = ranker.estimate_selectivity(&FilterExpr::Like {
column: "x".to_string(),
pattern: "test%".to_string(),
});
let contains = ranker.estimate_selectivity(&FilterExpr::Like {
column: "x".to_string(),
pattern: "%test%".to_string(),
});
assert!(prefix < contains);
}
}