use fsqlite_types::value::SqliteValue;
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
use std::collections::HashMap;
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct HistogramBucket {
pub lower: SqliteValue,
pub upper: SqliteValue,
pub count: u64,
pub ndv: u64,
}
impl HistogramBucket {
pub fn contains(&self, value: &SqliteValue) -> bool {
value >= &self.lower && value <= &self.upper
}
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct Histogram {
pub buckets: Vec<HistogramBucket>,
}
impl Histogram {
pub fn estimate_equality_rows(&self, value: &SqliteValue) -> f64 {
for bucket in &self.buckets {
if bucket.contains(value) {
let ndv = bucket.ndv.max(1) as f64;
return bucket.count as f64 / ndv;
}
}
1.0
}
pub fn estimate_less_than_rows(&self, value: &SqliteValue) -> f64 {
let mut count = 0.0;
for bucket in &self.buckets {
if value > &bucket.upper {
count += bucket.count as f64;
} else if value <= &bucket.lower {
break;
} else {
let fraction = interpolate_position(&bucket.lower, &bucket.upper, value);
count = (bucket.count as f64).mul_add(fraction, count);
break;
}
}
count
}
pub fn estimate_greater_than_rows(&self, value: &SqliteValue) -> f64 {
let mut count = 0.0;
for bucket in self.buckets.iter().rev() {
if value < &bucket.lower {
count += bucket.count as f64;
} else if value >= &bucket.upper {
break;
} else {
let fraction = 1.0 - interpolate_position(&bucket.lower, &bucket.upper, value);
count = (bucket.count as f64).mul_add(fraction, count);
break;
}
}
count
}
}
fn bytes_to_fraction(bytes: &[u8]) -> f64 {
let mut fraction = 0.0;
let mut weight = 1.0 / 256.0;
for &b in bytes.iter().take(8) {
fraction = f64::from(b).mul_add(weight, fraction);
weight /= 256.0;
}
fraction
}
fn interpolate_position(min: &SqliteValue, max: &SqliteValue, val: &SqliteValue) -> f64 {
match (min, max, val) {
(SqliteValue::Integer(min_i), SqliteValue::Integer(max_i), SqliteValue::Integer(val_i)) => {
if max_i <= min_i {
return 0.5;
}
let range = *max_i as f64 - *min_i as f64;
let offset = *val_i as f64 - *min_i as f64;
let fraction = offset / range;
if fraction.is_nan() {
0.5
} else {
fraction.clamp(0.0, 1.0)
}
}
(SqliteValue::Float(min_f), SqliteValue::Float(max_f), SqliteValue::Float(val_f)) => {
if max_f <= min_f || min_f.is_nan() || max_f.is_nan() || val_f.is_nan() {
return 0.5;
}
let range = max_f - min_f;
let offset = val_f - min_f;
let fraction = offset / range;
if fraction.is_nan() {
0.5
} else {
fraction.clamp(0.0, 1.0)
}
}
(SqliteValue::Text(min_s), SqliteValue::Text(max_s), SqliteValue::Text(val_s)) => {
if max_s <= min_s {
return 0.5;
}
let min_frac = bytes_to_fraction(min_s.as_bytes());
let max_frac = bytes_to_fraction(max_s.as_bytes());
let val_frac = bytes_to_fraction(val_s.as_bytes());
let range = max_frac - min_frac;
if range <= 0.0 {
return 0.5;
}
let offset = val_frac - min_frac;
(offset / range).clamp(0.0, 1.0)
}
(SqliteValue::Blob(min_b), SqliteValue::Blob(max_b), SqliteValue::Blob(val_b)) => {
if max_b <= min_b {
return 0.5;
}
let min_frac = bytes_to_fraction(min_b);
let max_frac = bytes_to_fraction(max_b);
let val_frac = bytes_to_fraction(val_b);
let range = max_frac - min_frac;
if range <= 0.0 {
return 0.5;
}
let offset = val_frac - min_frac;
(offset / range).clamp(0.0, 1.0)
}
_ => 0.5,
}
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub struct ColumnStats {
pub table_row_count: u64,
pub null_count: u64,
pub ndv: u64,
pub min_value: Option<SqliteValue>,
pub max_value: Option<SqliteValue>,
pub avg_width: f64,
pub histogram: Option<Histogram>,
}
impl ColumnStats {
pub fn estimate_selectivity(&self, op: &Operator, value: &SqliteValue) -> f64 {
if self.table_row_count == 0 {
return 0.0;
}
let non_null_count = self.table_row_count.saturating_sub(self.null_count) as f64;
if non_null_count <= 0.0 {
return 0.0;
}
let estimated_matches = match op {
Operator::Eq => {
if let Some(hist) = &self.histogram {
hist.estimate_equality_rows(value)
} else {
let ndv = self.ndv.max(1) as f64;
non_null_count / ndv
}
}
Operator::Lt => {
if let Some(hist) = &self.histogram {
hist.estimate_less_than_rows(value)
} else {
non_null_count / 3.0
}
}
Operator::Gt => {
if let Some(hist) = &self.histogram {
hist.estimate_greater_than_rows(value)
} else {
non_null_count / 3.0
}
}
Operator::Le => {
let lt = if let Some(hist) = &self.histogram {
hist.estimate_less_than_rows(value)
} else {
non_null_count / 3.0
};
let eq = if let Some(hist) = &self.histogram {
hist.estimate_equality_rows(value)
} else {
let ndv = self.ndv.max(1) as f64;
non_null_count / ndv
};
lt + eq
}
Operator::Ge => {
let gt = if let Some(hist) = &self.histogram {
hist.estimate_greater_than_rows(value)
} else {
non_null_count / 3.0
};
let eq = if let Some(hist) = &self.histogram {
hist.estimate_equality_rows(value)
} else {
let ndv = self.ndv.max(1) as f64;
non_null_count / ndv
};
gt + eq
}
Operator::Ne => {
let eq_matches = if let Some(hist) = &self.histogram {
hist.estimate_equality_rows(value)
} else {
let ndv = self.ndv.max(1) as f64;
non_null_count / ndv
};
(non_null_count - eq_matches).max(0.0)
}
_ => non_null_count * 0.1, };
if self.table_row_count == 0 {
return 0.0;
}
let fraction = estimated_matches / self.table_row_count as f64;
if fraction.is_nan() {
0.0
} else {
fraction.clamp(0.0, 1.0)
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Operator {
Eq,
Ne,
Lt,
Le,
Gt,
Ge,
Like,
Glob,
Is,
IsNot,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct TableStatistics {
pub row_count: u64,
pub columns: HashMap<String, ColumnStats>,
}
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct Stat1Summary {
pub n_rows: u64,
pub per_column_distinct: Vec<u64>,
}
#[must_use]
pub fn parse_stat1(stat: &str) -> Option<Stat1Summary> {
let mut parts = stat.split_ascii_whitespace();
let first = parts.next()?;
let n_rows: u64 = first.parse().ok()?;
let per_column_distinct: Vec<u64> = parts.filter_map(|t| t.parse::<u64>().ok()).collect();
Some(Stat1Summary {
n_rows,
per_column_distinct,
})
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EstimationMethod {
Histogram,
Ndv,
Sampling,
Heuristic,
}
#[derive(Debug, Clone)]
pub struct CardinalityEstimate {
pub estimated_rows: f64,
pub selectivity: f64,
pub method: EstimationMethod,
}
impl ColumnStats {
pub fn estimate_cardinality(
&self,
op: &Operator,
value: &SqliteValue,
sample: Option<&[SqliteValue]>,
) -> CardinalityEstimate {
let rows = self.table_row_count as f64;
if rows <= 0.0 {
return CardinalityEstimate {
estimated_rows: 0.0,
selectivity: 0.0,
method: EstimationMethod::Heuristic,
};
}
if self.histogram.is_some() {
let sel = self.estimate_selectivity(op, value);
let span = tracing::debug_span!(
target: "fsqlite.planner",
"cost_estimate",
table = tracing::field::Empty,
estimated_rows = (sel * rows),
actual_method = "histogram",
);
let _g = span.enter();
return CardinalityEstimate {
estimated_rows: sel * rows,
selectivity: sel,
method: EstimationMethod::Histogram,
};
}
if let Some(sample) = sample {
if !sample.is_empty() {
let matching = sample
.iter()
.filter(|sv| cmp_matches(sv, *op, value))
.count();
let sel = (matching as f64 / sample.len() as f64).clamp(0.0, 1.0);
let span = tracing::debug_span!(
target: "fsqlite.planner",
"cost_estimate",
table = tracing::field::Empty,
estimated_rows = (sel * rows),
actual_method = "sampling",
);
let _g = span.enter();
return CardinalityEstimate {
estimated_rows: sel * rows,
selectivity: sel,
method: EstimationMethod::Sampling,
};
}
}
if matches!(op, Operator::Eq | Operator::Is) && self.ndv > 0 {
let sel = 1.0 / self.ndv as f64;
let span = tracing::debug_span!(
target: "fsqlite.planner",
"cost_estimate",
table = tracing::field::Empty,
estimated_rows = (sel * rows),
actual_method = "ndv",
);
let _g = span.enter();
return CardinalityEstimate {
estimated_rows: sel * rows,
selectivity: sel,
method: EstimationMethod::Ndv,
};
}
let sel = default_selectivity(*op);
let span = tracing::debug_span!(
target: "fsqlite.planner",
"cost_estimate",
table = tracing::field::Empty,
estimated_rows = (sel * rows),
actual_method = "heuristic",
);
let _g = span.enter();
CardinalityEstimate {
estimated_rows: sel * rows,
selectivity: sel,
method: EstimationMethod::Heuristic,
}
}
}
fn default_selectivity(op: Operator) -> f64 {
match op {
Operator::Eq | Operator::Is => 0.01, Operator::Ne | Operator::IsNot => 0.99,
Operator::Lt | Operator::Le | Operator::Gt | Operator::Ge => 1.0 / 3.0,
Operator::Like | Operator::Glob => 0.1,
}
}
fn cmp_matches(sample_val: &SqliteValue, op: Operator, probe: &SqliteValue) -> bool {
let ord = sample_val.partial_cmp(probe);
match op {
Operator::Eq | Operator::Is => ord == Some(Ordering::Equal),
Operator::Ne | Operator::IsNot => ord != Some(Ordering::Equal),
Operator::Lt => ord == Some(Ordering::Less),
Operator::Le => matches!(ord, Some(Ordering::Less | Ordering::Equal)),
Operator::Gt => ord == Some(Ordering::Greater),
Operator::Ge => matches!(ord, Some(Ordering::Greater | Ordering::Equal)),
Operator::Like | Operator::Glob => false, }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_histogram_interpolation_integer() {
let bucket = HistogramBucket {
lower: SqliteValue::Integer(0),
upper: SqliteValue::Integer(100),
count: 100,
ndv: 100,
};
let hist = Histogram {
buckets: vec![bucket],
};
let est = hist.estimate_less_than_rows(&SqliteValue::Integer(50));
assert!((est - 50.0).abs() < 1.0);
}
#[test]
fn test_selectivity_defaults() {
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 100,
min_value: Some(SqliteValue::Integer(0)),
max_value: Some(SqliteValue::Integer(1000)),
avg_width: 8.0,
histogram: None,
};
let sel = stats.estimate_selectivity(&Operator::Eq, &SqliteValue::Integer(50));
assert!((sel - 0.01).abs() < 0.001);
let sel = stats.estimate_selectivity(&Operator::Gt, &SqliteValue::Integer(50));
assert!((sel - 0.333).abs() < 0.001);
}
#[test]
fn test_cardinality_estimate_histogram_preferred() {
let hist = Histogram {
buckets: vec![HistogramBucket {
lower: SqliteValue::Integer(0),
upper: SqliteValue::Integer(100),
count: 1000,
ndv: 100,
}],
};
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 100,
min_value: Some(SqliteValue::Integer(0)),
max_value: Some(SqliteValue::Integer(100)),
avg_width: 8.0,
histogram: Some(hist),
};
let est = stats.estimate_cardinality(
&Operator::Eq,
&SqliteValue::Integer(50),
Some(&[SqliteValue::Integer(50)]), );
assert_eq!(est.method, EstimationMethod::Histogram);
assert!(est.estimated_rows > 0.0);
}
#[test]
fn test_cardinality_estimate_sampling_fallback() {
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 0,
min_value: None,
max_value: None,
avg_width: 0.0,
histogram: None,
};
let sample: Vec<SqliteValue> = (0..10)
.map(|i| SqliteValue::Integer(if i < 3 { 42 } else { i + 100 }))
.collect();
let est =
stats.estimate_cardinality(&Operator::Eq, &SqliteValue::Integer(42), Some(&sample));
assert_eq!(est.method, EstimationMethod::Sampling);
assert!((est.selectivity - 0.3).abs() < 0.01);
assert!((est.estimated_rows - 300.0).abs() < 1.0);
}
#[test]
fn test_cardinality_estimate_ndv_fallback() {
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 50,
min_value: None,
max_value: None,
avg_width: 0.0,
histogram: None,
};
let est = stats.estimate_cardinality(&Operator::Eq, &SqliteValue::Integer(42), None);
assert_eq!(est.method, EstimationMethod::Ndv);
assert!((est.selectivity - 0.02).abs() < 0.001);
assert!((est.estimated_rows - 20.0).abs() < 0.1);
}
#[test]
fn test_cardinality_estimate_heuristic_fallback() {
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 0,
min_value: None,
max_value: None,
avg_width: 0.0,
histogram: None,
};
let est = stats.estimate_cardinality(&Operator::Gt, &SqliteValue::Integer(42), None);
assert_eq!(est.method, EstimationMethod::Heuristic);
assert!((est.selectivity - 1.0 / 3.0).abs() < 0.01);
}
#[test]
fn test_default_selectivity_values() {
assert!((default_selectivity(Operator::Eq) - 0.01).abs() < 0.001);
assert!((default_selectivity(Operator::Ne) - 0.99).abs() < 0.001);
assert!((default_selectivity(Operator::Lt) - 0.333).abs() < 0.001);
assert!((default_selectivity(Operator::Like) - 0.1).abs() < 0.001);
}
#[test]
fn test_cmp_matches() {
let v50 = SqliteValue::Integer(50);
let v100 = SqliteValue::Integer(100);
assert!(cmp_matches(&v50, Operator::Eq, &v50));
assert!(!cmp_matches(&v50, Operator::Eq, &v100));
assert!(cmp_matches(&v50, Operator::Lt, &v100));
assert!(!cmp_matches(&v100, Operator::Lt, &v50));
assert!(cmp_matches(&v50, Operator::Le, &v50));
assert!(cmp_matches(&v100, Operator::Gt, &v50));
assert!(cmp_matches(&v100, Operator::Ge, &v100));
assert!(cmp_matches(&v50, Operator::Ne, &v100));
}
#[test]
fn test_estimation_method_hierarchy() {
let stats = ColumnStats {
table_row_count: 1000,
null_count: 0,
ndv: 50,
min_value: None,
max_value: None,
avg_width: 0.0,
histogram: None,
};
let sample = vec![SqliteValue::Integer(42); 10];
let est =
stats.estimate_cardinality(&Operator::Eq, &SqliteValue::Integer(42), Some(&sample));
assert_eq!(est.method, EstimationMethod::Sampling);
assert!((est.selectivity - 1.0).abs() < 0.01);
}
#[test]
fn parse_stat1_table_only_row() {
let parsed = parse_stat1("12345").unwrap();
assert_eq!(parsed.n_rows, 12345);
assert!(parsed.per_column_distinct.is_empty());
}
#[test]
fn parse_stat1_index_row_with_distinct_counts() {
let parsed = parse_stat1("1000 10 1").unwrap();
assert_eq!(parsed.n_rows, 1000);
assert_eq!(parsed.per_column_distinct, vec![10, 1]);
}
#[test]
fn parse_stat1_tolerates_extra_whitespace() {
let parsed = parse_stat1(" 500\t 20 5 ").unwrap();
assert_eq!(parsed.n_rows, 500);
assert_eq!(parsed.per_column_distinct, vec![20, 5]);
}
#[test]
fn parse_stat1_rejects_empty_and_non_numeric() {
assert!(parse_stat1("").is_none());
assert!(parse_stat1(" ").is_none());
assert!(parse_stat1("not-a-number 1 2").is_none());
}
#[test]
fn parse_stat1_drops_trailing_garbage() {
let parsed = parse_stat1("42 7 foo 3").unwrap();
assert_eq!(parsed.n_rows, 42);
assert_eq!(parsed.per_column_distinct, vec![7, 3]);
}
}