use fsqlite_types::value::SqliteValue;
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
use std::collections::HashMap;
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct HistogramBucket {
pub lower: SqliteValue,
pub upper: SqliteValue,
pub count: u64,
pub ndv: u64,
}
impl HistogramBucket {
pub fn contains(&self, value: &SqliteValue) -> bool {
value >= &self.lower && value <= &self.upper
}
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub struct Histogram {
pub buckets: Vec<HistogramBucket>,
}
impl Histogram {
pub fn estimate_equality_rows(&self, value: &SqliteValue) -> f64 {
for bucket in &self.buckets {
if bucket.contains(value) {
let ndv = bucket.ndv.max(1) as f64;
return bucket.count as f64 / ndv;
}
}
1.0
}
pub fn estimate_less_than_rows(&self, value: &SqliteValue) -> f64 {
let mut count = 0.0;
for bucket in &self.buckets {
if value > &bucket.upper {
count += bucket.count as f64;
} else if value <= &bucket.lower {
break;
} else {
let fraction = interpolate_position(&bucket.lower, &bucket.upper, value);
count += bucket.count as f64 * fraction;
break;
}
}
count
}
pub fn estimate_greater_than_rows(&self, value: &SqliteValue) -> f64 {
let mut count = 0.0;
for bucket in self.buckets.iter().rev() {
if value < &bucket.lower {
count += bucket.count as f64;
} else if value >= &bucket.upper {
break;
} else {
let fraction = 1.0 - interpolate_position(&bucket.lower, &bucket.upper, value);
count += bucket.count as f64 * fraction;
break;
}
}
count
}
}
fn interpolate_position(min: &SqliteValue, max: &SqliteValue, val: &SqliteValue) -> f64 {
match (min, max, val) {
(SqliteValue::Integer(min_i), SqliteValue::Integer(max_i), SqliteValue::Integer(val_i)) => {
if max_i <= min_i {
return 0.5;
}
let range = (max_i - min_i) as f64;
let offset = (val_i - min_i) as f64;
(offset / range).clamp(0.0, 1.0)
}
(SqliteValue::Float(min_f), SqliteValue::Float(max_f), SqliteValue::Float(val_f)) => {
if max_f <= min_f {
return 0.5;
}
let range = max_f - min_f;
let offset = val_f - min_f;
(offset / range).clamp(0.0, 1.0)
}
_ => 0.5,
}
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub struct ColumnStats {
pub table_row_count: u64,
pub null_count: u64,
pub ndv: u64,
pub min_value: Option<SqliteValue>,
pub max_value: Option<SqliteValue>,
pub avg_width: f64,
pub histogram: Option<Histogram>,
}
impl ColumnStats {
pub fn estimate_selectivity(&self, op: &Operator, value: &SqliteValue) -> f64 {
if self.table_row_count == 0 {
return 0.0;
}
let non_null_count = self.table_row_count.saturating_sub(self.null_count) as f64;
if non_null_count <= 0.0 {
return 0.0;
}
let estimated_matches = match op {
Operator::Eq => {
if let Some(hist) = &self.histogram {
hist.estimate_equality_rows(value)
} else {
let ndv = self.ndv.max(1) as f64;
non_null_count / ndv
}
}
Operator::Lt => {
if let Some(hist) = &self.histogram {
hist.estimate_less_than_rows(value)
} else {
non_null_count / 3.0
}
}
Operator::Gt => {
if let Some(hist) = &self.histogram {
hist.estimate_greater_than_rows(value)
} else {
non_null_count / 3.0
}
}
Operator::Le => {
let lt = if let Some(hist) = &self.histogram {
hist.estimate_less_than_rows(value)
} else {
non_null_count / 3.0
};
let eq = if let Some(hist) = &self.histogram {
hist.estimate_equality_rows(value)
} else {
let ndv = self.ndv.max(1) as f64;
non_null_count / ndv
};
lt + eq
}
Operator::Ge => {
let gt = if let Some(hist) = &self.histogram {
hist.estimate_greater_than_rows(value)
} else {
non_null_count / 3.0
};
let eq = if let Some(hist) = &self.histogram {
hist.estimate_equality_rows(value)
} else {
let ndv = self.ndv.max(1) as f64;
non_null_count / ndv
};
gt + eq
}
Operator::Ne => {
let eq_sel = self.estimate_selectivity(&Operator::Eq, value);
non_null_count * (1.0 - eq_sel)
}
_ => non_null_count * 0.1, };
(estimated_matches / self.table_row_count as f64).clamp(0.0, 1.0)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Operator {
Eq,
Ne,
Lt,
Le,
Gt,
Ge,
Like,
Glob,
Is,
IsNot,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct TableStatistics {
pub row_count: u64,
pub columns: HashMap<String, ColumnStats>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EstimationMethod {
Histogram,
Ndv,
Sampling,
Heuristic,
}
#[derive(Debug, Clone)]
pub struct CardinalityEstimate {
pub estimated_rows: f64,
pub selectivity: f64,
pub method: EstimationMethod,
}
impl ColumnStats {
pub fn estimate_cardinality(
&self,
op: &Operator,
value: &SqliteValue,
sample: Option<&[SqliteValue]>,
) -> CardinalityEstimate {
let rows = self.table_row_count as f64;
if rows <= 0.0 {
return CardinalityEstimate {
estimated_rows: 0.0,
selectivity: 0.0,
method: EstimationMethod::Heuristic,
};
}
if self.histogram.is_some() {
let sel = self.estimate_selectivity(op, value);
let span = tracing::debug_span!(
target: "fsqlite.planner",
"cost_estimate",
table = tracing::field::Empty,
estimated_rows = (sel * rows),
actual_method = "histogram",
);
let _g = span.enter();
return CardinalityEstimate {
estimated_rows: sel * rows,
selectivity: sel,
method: EstimationMethod::Histogram,
};
}
if let Some(sample) = sample {
if !sample.is_empty() {
let matching = sample
.iter()
.filter(|sv| cmp_matches(sv, *op, value))
.count();
let sel = (matching as f64 / sample.len() as f64).clamp(0.0, 1.0);
let span = tracing::debug_span!(
target: "fsqlite.planner",
"cost_estimate",
table = tracing::field::Empty,
estimated_rows = (sel * rows),
actual_method = "sampling",
);
let _g = span.enter();
return CardinalityEstimate {
estimated_rows: sel * rows,
selectivity: sel,
method: EstimationMethod::Sampling,
};
}
}
if matches!(op, Operator::Eq | Operator::Is) && self.ndv > 0 {
let sel = 1.0 / self.ndv as f64;
let span = tracing::debug_span!(
target: "fsqlite.planner",
"cost_estimate",
table = tracing::field::Empty,
estimated_rows = (sel * rows),
actual_method = "ndv",
);
let _g = span.enter();
return CardinalityEstimate {
estimated_rows: sel * rows,
selectivity: sel,
method: EstimationMethod::Ndv,
};
}
let sel = default_selectivity(*op);
let span = tracing::debug_span!(
target: "fsqlite.planner",
"cost_estimate",
table = tracing::field::Empty,
estimated_rows = (sel * rows),
actual_method = "heuristic",
);
let _g = span.enter();
CardinalityEstimate {
estimated_rows: sel * rows,
selectivity: sel,
method: EstimationMethod::Heuristic,
}
}
}
fn default_selectivity(op: Operator) -> f64 {
match op {
Operator::Eq | Operator::Is => 0.01, Operator::Ne | Operator::IsNot => 0.99,
Operator::Lt | Operator::Le | Operator::Gt | Operator::Ge => 1.0 / 3.0,
Operator::Like | Operator::Glob => 0.1,
}
}
fn cmp_matches(sample_val: &SqliteValue, op: Operator, probe: &SqliteValue) -> bool {
let ord = sample_val.partial_cmp(probe);
match op {
Operator::Eq | Operator::Is => ord == Some(Ordering::Equal),
Operator::Ne | Operator::IsNot => ord != Some(Ordering::Equal),
Operator::Lt => ord == Some(Ordering::Less),
Operator::Le => matches!(ord, Some(Ordering::Less | Ordering::Equal)),
Operator::Gt => ord == Some(Ordering::Greater),
Operator::Ge => matches!(ord, Some(Ordering::Greater | Ordering::Equal)),
Operator::Like | Operator::Glob => false, }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_histogram_interpolation_integer() {
let bucket = HistogramBucket {
lower: SqliteValue::Integer(0),
upper: SqliteValue::Integer(100),
count: 100,
ndv: 100,
};
let hist = Histogram { buckets: vec![bucket] };
let est = hist.estimate_less_than_rows(&SqliteValue::Integer(50));
assert!((est - 50.0).abs() < 1.0);
}
#[test]
fn test_selectivity_defaults() {
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 100,
min_value: Some(SqliteValue::Integer(0)),
max_value: Some(SqliteValue::Integer(1000)),
avg_width: 8.0,
histogram: None,
};
let sel = stats.estimate_selectivity(&Operator::Eq, &SqliteValue::Integer(50));
assert!((sel - 0.01).abs() < 0.001);
let sel = stats.estimate_selectivity(&Operator::Gt, &SqliteValue::Integer(50));
assert!((sel - 0.333).abs() < 0.001);
}
#[test]
fn test_cardinality_estimate_histogram_preferred() {
let hist = Histogram {
buckets: vec![HistogramBucket {
lower: SqliteValue::Integer(0),
upper: SqliteValue::Integer(100),
count: 1000,
ndv: 100,
}],
};
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 100,
min_value: Some(SqliteValue::Integer(0)),
max_value: Some(SqliteValue::Integer(100)),
avg_width: 8.0,
histogram: Some(hist),
};
let est = stats.estimate_cardinality(
&Operator::Eq,
&SqliteValue::Integer(50),
Some(&[SqliteValue::Integer(50)]), );
assert_eq!(est.method, EstimationMethod::Histogram);
assert!(est.estimated_rows > 0.0);
}
#[test]
fn test_cardinality_estimate_sampling_fallback() {
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 0,
min_value: None,
max_value: None,
avg_width: 0.0,
histogram: None,
};
let sample: Vec<SqliteValue> = (0..10)
.map(|i| SqliteValue::Integer(if i < 3 { 42 } else { i + 100 }))
.collect();
let est = stats.estimate_cardinality(
&Operator::Eq,
&SqliteValue::Integer(42),
Some(&sample),
);
assert_eq!(est.method, EstimationMethod::Sampling);
assert!((est.selectivity - 0.3).abs() < 0.01);
assert!((est.estimated_rows - 300.0).abs() < 1.0);
}
#[test]
fn test_cardinality_estimate_ndv_fallback() {
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 50,
min_value: None,
max_value: None,
avg_width: 0.0,
histogram: None,
};
let est = stats.estimate_cardinality(
&Operator::Eq,
&SqliteValue::Integer(42),
None,
);
assert_eq!(est.method, EstimationMethod::Ndv);
assert!((est.selectivity - 0.02).abs() < 0.001);
assert!((est.estimated_rows - 20.0).abs() < 0.1);
}
#[test]
fn test_cardinality_estimate_heuristic_fallback() {
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 0,
min_value: None,
max_value: None,
avg_width: 0.0,
histogram: None,
};
let est = stats.estimate_cardinality(
&Operator::Gt,
&SqliteValue::Integer(42),
None,
);
assert_eq!(est.method, EstimationMethod::Heuristic);
assert!((est.selectivity - 1.0 / 3.0).abs() < 0.01);
}
#[test]
fn test_default_selectivity_values() {
assert!((default_selectivity(Operator::Eq) - 0.01).abs() < 0.001);
assert!((default_selectivity(Operator::Ne) - 0.99).abs() < 0.001);
assert!((default_selectivity(Operator::Lt) - 0.333).abs() < 0.001);
assert!((default_selectivity(Operator::Like) - 0.1).abs() < 0.001);
}
#[test]
fn test_cmp_matches() {
let v50 = SqliteValue::Integer(50);
let v100 = SqliteValue::Integer(100);
assert!(cmp_matches(&v50, Operator::Eq, &v50));
assert!(!cmp_matches(&v50, Operator::Eq, &v100));
assert!(cmp_matches(&v50, Operator::Lt, &v100));
assert!(!cmp_matches(&v100, Operator::Lt, &v50));
assert!(cmp_matches(&v50, Operator::Le, &v50));
assert!(cmp_matches(&v100, Operator::Gt, &v50));
assert!(cmp_matches(&v100, Operator::Ge, &v100));
assert!(cmp_matches(&v50, Operator::Ne, &v100));
}
#[test]
fn test_estimation_method_hierarchy() {
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 50,
min_value: None,
max_value: None,
avg_width: 0.0,
histogram: None,
};
let sample = vec![SqliteValue::Integer(42); 10];
let est = stats.estimate_cardinality(
&Operator::Eq,
&SqliteValue::Integer(42),
Some(&sample),
);
assert_eq!(est.method, EstimationMethod::Sampling);
assert!((est.selectivity - 1.0).abs() < 0.01);
}
}