use ahash::AHashMap;
use crate::core::Value;
pub const SYS_TABLE_STATS: &str = "_sys_table_stats";
pub const SYS_COLUMN_STATS: &str = "_sys_column_stats";
pub const CREATE_TABLE_STATS_SQL: &str = r#"
CREATE TABLE IF NOT EXISTS _sys_table_stats (
id INTEGER PRIMARY KEY AUTO_INCREMENT,
table_name TEXT NOT NULL UNIQUE,
row_count INTEGER NOT NULL DEFAULT 0,
page_count INTEGER NOT NULL DEFAULT 0,
avg_row_size INTEGER NOT NULL DEFAULT 0,
last_analyzed TIMESTAMP
)
"#;
pub const CREATE_COLUMN_STATS_SQL: &str = r#"
CREATE TABLE IF NOT EXISTS _sys_column_stats (
id INTEGER PRIMARY KEY AUTO_INCREMENT,
table_name TEXT NOT NULL,
column_name TEXT NOT NULL,
null_count INTEGER NOT NULL DEFAULT 0,
distinct_count INTEGER NOT NULL DEFAULT 0,
min_value TEXT,
max_value TEXT,
avg_width INTEGER NOT NULL DEFAULT 0,
histogram TEXT
)
"#;
pub const DEFAULT_HISTOGRAM_BUCKETS: usize = 32;
#[derive(Debug, Clone)]
pub struct Histogram {
pub boundaries: Vec<Value>,
pub rows_per_bucket: u64,
pub total_rows: u64,
}
impl Histogram {
pub fn from_sorted_values(values: &[Value], num_buckets: usize) -> Option<Self> {
if values.is_empty() || num_buckets == 0 {
return None;
}
let non_null_values: Vec<_> = values.iter().filter(|v| !v.is_null()).collect();
if non_null_values.is_empty() {
return None;
}
let total_rows = non_null_values.len() as u64;
let rows_per_bucket = (total_rows / num_buckets as u64).max(1);
let mut boundaries = Vec::with_capacity(num_buckets + 1);
boundaries.push(non_null_values[0].clone());
for i in 1..num_buckets {
let idx = (i * non_null_values.len() / num_buckets).min(non_null_values.len() - 1);
let boundary = non_null_values[idx].clone();
if boundaries.last() != Some(&boundary) {
boundaries.push(boundary);
}
}
let last_value = (*non_null_values.last().unwrap()).clone();
if boundaries.last() != Some(&last_value) {
boundaries.push(last_value);
}
Some(Self {
boundaries,
rows_per_bucket,
total_rows,
})
}
pub fn adaptive_from_sorted_values(
values: &[Value],
distinct_count: u64,
min_buckets: usize,
max_buckets: usize,
) -> Option<Self> {
if values.is_empty() {
return None;
}
let total_rows = values.iter().filter(|v| !v.is_null()).count() as u64;
if total_rows == 0 {
return None;
}
let distinct = distinct_count.max(1) as f64;
let rows = total_rows as f64;
let skew_factor = (rows / distinct + 1.0).log2();
let base_buckets = distinct.sqrt();
let optimal_buckets = (base_buckets * skew_factor).round() as usize;
let num_buckets = optimal_buckets.clamp(min_buckets, max_buckets);
Self::from_sorted_values(values, num_buckets)
}
pub fn estimate_selectivity(&self, value: &Value, operator: HistogramOp) -> f64 {
if self.boundaries.is_empty() || self.total_rows == 0 {
return 0.5; }
let num_buckets = self.boundaries.len().saturating_sub(1).max(1);
let bucket_idx = self.find_bucket(value);
match operator {
HistogramOp::Equal => {
1.0 / self.rows_per_bucket.max(1) as f64
}
HistogramOp::LessThan | HistogramOp::LessThanOrEqual => {
let full_buckets = bucket_idx as f64;
let bucket_fraction = self.fraction_in_bucket(value, bucket_idx);
let effective_buckets = full_buckets + bucket_fraction;
(effective_buckets / num_buckets as f64).clamp(0.0, 1.0)
}
HistogramOp::GreaterThan | HistogramOp::GreaterThanOrEqual => {
let remaining_buckets = (num_buckets - bucket_idx - 1) as f64;
let bucket_fraction = 1.0 - self.fraction_in_bucket(value, bucket_idx);
let effective_buckets = remaining_buckets + bucket_fraction;
(effective_buckets / num_buckets as f64).clamp(0.0, 1.0)
}
}
}
fn find_bucket(&self, value: &Value) -> usize {
if self.boundaries.is_empty() {
return 0;
}
let mut low = 0;
let mut high = self.boundaries.len().saturating_sub(1);
while low < high {
let mid = (low + high).div_ceil(2);
if mid < self.boundaries.len() && &self.boundaries[mid] <= value {
low = mid;
} else {
high = mid.saturating_sub(1);
}
}
low.min(self.boundaries.len().saturating_sub(2))
}
fn fraction_in_bucket(&self, value: &Value, bucket_idx: usize) -> f64 {
if bucket_idx >= self.boundaries.len().saturating_sub(1) {
return 1.0;
}
let lower = &self.boundaries[bucket_idx];
let upper = if bucket_idx + 1 < self.boundaries.len() {
&self.boundaries[bucket_idx + 1]
} else {
return 1.0;
};
match (lower, upper, value) {
(Value::Integer(lo), Value::Integer(hi), Value::Integer(v)) => {
if hi == lo {
0.5
} else {
((*v - *lo) as f64 / (*hi - *lo) as f64).clamp(0.0, 1.0)
}
}
(Value::Float(lo), Value::Float(hi), Value::Float(v)) => {
if (hi - lo).abs() < f64::EPSILON {
0.5
} else {
((v - lo) / (hi - lo)).clamp(0.0, 1.0)
}
}
_ => 0.5, }
}
pub fn estimate_range_selectivity(&self, low: &Value, high: &Value) -> f64 {
if self.boundaries.is_empty() || self.total_rows == 0 {
return 0.33; }
let num_buckets = self.boundaries.len().saturating_sub(1).max(1);
let low_bucket = self.find_bucket(low);
let high_bucket = self.find_bucket(high);
if low_bucket > high_bucket {
return 0.0;
}
if low_bucket == high_bucket {
let low_frac = self.fraction_in_bucket(low, low_bucket);
let high_frac = self.fraction_in_bucket(high, high_bucket);
let bucket_coverage = (high_frac - low_frac).max(0.0);
return (bucket_coverage / num_buckets as f64).clamp(0.0001, 1.0);
}
let mut total_coverage = 0.0;
let low_frac = self.fraction_in_bucket(low, low_bucket);
total_coverage += 1.0 - low_frac;
if high_bucket > low_bucket + 1 {
total_coverage += (high_bucket - low_bucket - 1) as f64;
}
let high_frac = self.fraction_in_bucket(high, high_bucket);
total_coverage += high_frac;
(total_coverage / num_buckets as f64).clamp(0.0001, 1.0)
}
pub fn to_json(&self) -> String {
let boundary_strs: Vec<String> = self.boundaries.iter().map(|v| v.to_string()).collect();
format!(
r#"{{"boundaries":[{}],"rows_per_bucket":{},"total_rows":{}}}"#,
boundary_strs
.iter()
.map(|s| format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")))
.collect::<Vec<_>>()
.join(","),
self.rows_per_bucket,
self.total_rows
)
}
pub fn from_json(json: &str) -> Option<Self> {
let json = json.trim();
if !json.starts_with('{') || !json.ends_with('}') {
return None;
}
let rows_per_bucket = extract_number(json, "rows_per_bucket")?;
let total_rows = extract_number(json, "total_rows")?;
let boundaries = extract_value_array(json, "boundaries")?;
Some(Self {
boundaries,
rows_per_bucket,
total_rows,
})
}
}
fn extract_number(json: &str, key: &str) -> Option<u64> {
let key_pattern = format!("\"{}\":", key);
let start = json.find(&key_pattern)? + key_pattern.len();
let rest = &json[start..];
let end = rest.find([',', '}'])?;
rest[..end].trim().parse().ok()
}
fn extract_value_array(json: &str, key: &str) -> Option<Vec<Value>> {
let key_pattern = format!("\"{}\":[", key);
let start = json.find(&key_pattern)? + key_pattern.len();
let rest = &json[start..];
let end = rest.find(']')?;
let array_content = &rest[..end];
let mut values = Vec::new();
for item in array_content.split(',') {
let item = item.trim();
if item.is_empty() {
continue;
}
let item = item.trim_matches('"');
if let Ok(i) = item.parse::<i64>() {
values.push(Value::Integer(i));
} else if let Ok(f) = item.parse::<f64>() {
values.push(Value::Float(f));
} else {
values.push(Value::Text(item.into()));
}
}
Some(values)
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HistogramOp {
Equal,
LessThan,
LessThanOrEqual,
GreaterThan,
GreaterThanOrEqual,
}
pub const DEFAULT_SAMPLE_SIZE: usize = 10000;
#[derive(Debug, Clone, Default)]
pub struct TableStats {
pub table_name: String,
pub row_count: u64,
pub page_count: u64,
pub avg_row_size: u64,
}
impl TableStats {
pub fn new(table_name: String) -> Self {
Self {
table_name,
row_count: 0,
page_count: 0,
avg_row_size: 0,
}
}
pub fn equality_selectivity(&self, distinct_count: u64) -> f64 {
if distinct_count > 0 {
1.0 / distinct_count as f64
} else if self.row_count > 0 {
1.0 / self.row_count as f64
} else {
0.1
}
}
}
#[derive(Debug, Clone, Default)]
pub struct ColumnStats {
pub column_name: String,
pub null_count: u64,
pub distinct_count: u64,
pub min_value: Option<Value>,
pub max_value: Option<Value>,
pub avg_width: u32,
pub histogram: Option<String>,
}
impl ColumnStats {
pub fn new(column_name: String) -> Self {
Self {
column_name,
null_count: 0,
distinct_count: 0,
min_value: None,
max_value: None,
avg_width: 0,
histogram: None,
}
}
pub fn is_empty(&self) -> bool {
self.distinct_count == 0 && self.min_value.is_none() && self.max_value.is_none()
}
pub fn parsed_histogram(&self) -> Option<Histogram> {
self.histogram
.as_ref()
.and_then(|json| Histogram::from_json(json))
}
pub fn set_histogram(&mut self, histogram: &Histogram) {
self.histogram = Some(histogram.to_json());
}
}
pub struct SelectivityEstimator;
impl SelectivityEstimator {
pub fn equality(distinct_count: u64) -> f64 {
if distinct_count > 0 {
1.0 / distinct_count as f64
} else {
0.1 }
}
pub fn histogram_join_cardinality(
left_stats: &ColumnStats,
right_stats: &ColumnStats,
left_rows: u64,
right_rows: u64,
) -> u64 {
let left_hist = left_stats.parsed_histogram();
let right_hist = right_stats.parsed_histogram();
match (left_hist, right_hist) {
(Some(lh), Some(rh)) => {
Self::join_cardinality_from_histograms(&lh, &rh, left_rows, right_rows)
}
_ => Self::join_cardinality_from_ranges(left_stats, right_stats, left_rows, right_rows),
}
}
fn join_cardinality_from_histograms(
left_hist: &Histogram,
right_hist: &Histogram,
left_rows: u64,
right_rows: u64,
) -> u64 {
if left_hist.boundaries.len() < 2 || right_hist.boundaries.len() < 2 {
return Self::join_cardinality(
left_rows,
right_rows,
left_hist.total_rows.max(1),
right_hist.total_rows.max(1),
);
}
let left_min = &left_hist.boundaries[0];
let left_max = &left_hist.boundaries[left_hist.boundaries.len() - 1];
let right_min = &right_hist.boundaries[0];
let right_max = &right_hist.boundaries[right_hist.boundaries.len() - 1];
let overall_overlap = Self::bucket_overlap(left_min, left_max, right_min, right_max);
if overall_overlap < 0.001 {
return 1;
}
let left_ndv = left_hist.total_rows;
let right_ndv = right_hist.total_rows;
let max_ndv = left_ndv.max(right_ndv).max(1);
let base_card = (left_rows as u128 * right_rows as u128 / max_ndv as u128) as f64;
let weighted_card = base_card * overall_overlap;
let min_input = left_rows.min(right_rows) as f64;
let final_card = weighted_card.min(min_input * overall_overlap * 1.5);
(final_card.ceil() as u64).max(1)
}
fn bucket_overlap(left_lo: &Value, left_hi: &Value, right_lo: &Value, right_hi: &Value) -> f64 {
let (llo, lhi) = match (left_lo, left_hi) {
(Value::Integer(a), Value::Integer(b)) => (*a as f64, *b as f64),
(Value::Float(a), Value::Float(b)) => (*a, *b),
_ => return 0.5, };
let (rlo, rhi) = match (right_lo, right_hi) {
(Value::Integer(a), Value::Integer(b)) => (*a as f64, *b as f64),
(Value::Float(a), Value::Float(b)) => (*a, *b),
_ => return 0.5,
};
if lhi < rlo || rhi < llo {
return 0.0;
}
let overlap_lo = llo.max(rlo);
let overlap_hi = lhi.min(rhi);
if overlap_hi <= overlap_lo {
return 0.0;
}
let left_width = (lhi - llo).abs().max(1.0);
let right_width = (rhi - rlo).abs().max(1.0);
let overlap_width = overlap_hi - overlap_lo;
let left_overlap_ratio = overlap_width / left_width;
let right_overlap_ratio = overlap_width / right_width;
(left_overlap_ratio * right_overlap_ratio).sqrt()
}
fn join_cardinality_from_ranges(
left_stats: &ColumnStats,
right_stats: &ColumnStats,
left_rows: u64,
right_rows: u64,
) -> u64 {
match (
&left_stats.min_value,
&left_stats.max_value,
&right_stats.min_value,
&right_stats.max_value,
) {
(Some(l_min), Some(l_max), Some(r_min), Some(r_max)) => {
if l_max < r_min || r_max < l_min {
return 0;
}
let overlap_ratio = Self::range_overlap_ratio(l_min, l_max, r_min, r_max);
if overlap_ratio < 0.001 {
return 0;
}
let naive = Self::join_cardinality(
left_rows,
right_rows,
left_stats.distinct_count.max(1),
right_stats.distinct_count.max(1),
);
((naive as f64 * overlap_ratio) as u64).max(1)
}
_ => {
Self::join_cardinality(
left_rows,
right_rows,
left_stats.distinct_count.max(1),
right_stats.distinct_count.max(1),
)
}
}
}
fn range_overlap_ratio(l_min: &Value, l_max: &Value, r_min: &Value, r_max: &Value) -> f64 {
let (llo, lhi) = match (l_min, l_max) {
(Value::Integer(a), Value::Integer(b)) => (*a as f64, *b as f64),
(Value::Float(a), Value::Float(b)) => (*a, *b),
_ => return 1.0, };
let (rlo, rhi) = match (r_min, r_max) {
(Value::Integer(a), Value::Integer(b)) => (*a as f64, *b as f64),
(Value::Float(a), Value::Float(b)) => (*a, *b),
_ => return 1.0,
};
let overlap_lo = llo.max(rlo);
let overlap_hi = lhi.min(rhi);
if overlap_hi <= overlap_lo {
return 0.0;
}
let total_range = (lhi - llo).abs().max(rhi - rlo).max(1.0);
let overlap_range = overlap_hi - overlap_lo;
(overlap_range / total_range).clamp(0.0, 1.0)
}
pub fn range() -> f64 {
0.33
}
pub fn range_with_histogram(col_stats: &ColumnStats, value: &Value, op: HistogramOp) -> f64 {
if let Some(histogram) = col_stats.parsed_histogram() {
return histogram.estimate_selectivity(value, op);
}
if let (Some(min_val), Some(max_val)) = (&col_stats.min_value, &col_stats.max_value) {
let fraction = Self::estimate_position(value, min_val, max_val);
return match op {
HistogramOp::Equal => 1.0 / col_stats.distinct_count.max(1) as f64,
HistogramOp::LessThan | HistogramOp::LessThanOrEqual => fraction,
HistogramOp::GreaterThan | HistogramOp::GreaterThanOrEqual => 1.0 - fraction,
};
}
match op {
HistogramOp::Equal => 0.1,
_ => 0.33,
}
}
fn estimate_position(value: &Value, min: &Value, max: &Value) -> f64 {
match (min, max, value) {
(Value::Integer(lo), Value::Integer(hi), Value::Integer(v)) => {
if hi == lo {
0.5
} else {
((*v - *lo) as f64 / (*hi - *lo) as f64).clamp(0.0, 1.0)
}
}
(Value::Float(lo), Value::Float(hi), Value::Float(v)) => {
if (hi - lo).abs() < f64::EPSILON {
0.5
} else {
((v - lo) / (hi - lo)).clamp(0.0, 1.0)
}
}
_ => 0.5, }
}
pub fn like(pattern: &str, distinct_count: u64) -> f64 {
if !pattern.starts_with('%') && pattern.ends_with('%') {
let prefix_len = pattern.len() - 1;
if distinct_count > 0 {
let prefix_selectivity = (26.0_f64).powi(-(prefix_len as i32));
return prefix_selectivity.max(1.0 / distinct_count as f64);
}
return 0.1;
}
if pattern.starts_with('%') {
return 0.25;
}
0.15 }
pub fn in_list(list_size: usize, distinct_count: u64) -> f64 {
if distinct_count > 0 {
(list_size as f64 / distinct_count as f64).min(1.0)
} else {
(list_size as f64 * 0.1).min(1.0)
}
}
pub fn is_null(null_count: u64, row_count: u64) -> f64 {
if row_count > 0 {
null_count as f64 / row_count as f64
} else {
0.01
}
}
pub fn is_not_null(null_count: u64, row_count: u64) -> f64 {
1.0 - Self::is_null(null_count, row_count)
}
pub fn join_cardinality(
left_rows: u64,
right_rows: u64,
left_distinct: u64,
right_distinct: u64,
) -> u64 {
let max_distinct = left_distinct.max(right_distinct).max(1);
(left_rows as u128 * right_rows as u128 / max_distinct as u128) as u64
}
}
pub fn is_stats_table(table_name: &str) -> bool {
let lower = table_name.to_lowercase();
lower == SYS_TABLE_STATS || lower == SYS_COLUMN_STATS
}
pub type CorrelationCoefficient = f64;
#[derive(Debug, Clone, Default)]
pub struct ColumnCorrelations {
correlations: AHashMap<(String, String), CorrelationCoefficient>,
functional_deps: AHashMap<String, Vec<String>>,
}
impl ColumnCorrelations {
pub fn new() -> Self {
Self::default()
}
pub fn add_correlation(&mut self, col1: &str, col2: &str, coefficient: CorrelationCoefficient) {
let (c1, c2) = Self::normalize_pair(col1, col2);
self.correlations.insert(
(c1.to_string(), c2.to_string()),
coefficient.clamp(-1.0, 1.0),
);
}
pub fn add_functional_dep(&mut self, determinant: &str, dependent: &str) {
self.functional_deps
.entry(determinant.to_string())
.or_default()
.push(dependent.to_string());
self.add_correlation(determinant, dependent, 1.0);
}
pub fn get_correlation(&self, col1: &str, col2: &str) -> CorrelationCoefficient {
let (c1, c2) = Self::normalize_pair(col1, col2);
self.correlations
.get(&(c1.to_string(), c2.to_string()))
.copied()
.unwrap_or(0.0)
}
pub fn determines(&self, col1: &str, col2: &str) -> bool {
self.functional_deps
.get(col1)
.map(|deps| deps.iter().any(|d| d == col2))
.unwrap_or(false)
}
pub fn correlated_columns(&self, col: &str) -> Vec<(&str, CorrelationCoefficient)> {
self.correlations
.iter()
.filter_map(|((c1, c2), coef)| {
if c1 == col {
Some((c2.as_str(), *coef))
} else if c2 == col {
Some((c1.as_str(), *coef))
} else {
None
}
})
.collect()
}
fn normalize_pair<'a>(col1: &'a str, col2: &'a str) -> (&'a str, &'a str) {
if col1 <= col2 {
(col1, col2)
} else {
(col2, col1)
}
}
pub fn combined_selectivity(&self, selectivities: &[(&str, f64)]) -> f64 {
if selectivities.is_empty() {
return 1.0;
}
if selectivities.len() == 1 {
return selectivities[0].1;
}
let mut result = selectivities
.iter()
.map(|(_, s)| *s)
.min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.unwrap_or(1.0);
let mut used = vec![false; selectivities.len()];
for (i, (_, s)) in selectivities.iter().enumerate() {
if (*s - result).abs() < f64::EPSILON {
used[i] = true;
break;
}
}
for (i, (col_i, sel_i)) in selectivities.iter().enumerate() {
if used[i] {
continue;
}
let mut max_corr: f64 = 0.0;
for (j, (col_j, _)) in selectivities.iter().enumerate() {
if used[j] {
let corr = self.get_correlation(col_i, col_j).abs();
max_corr = max_corr.max(corr);
}
}
let adjusted_sel = if max_corr >= 0.99 {
1.0 - (1.0 - *sel_i) * (1.0 - max_corr)
} else if max_corr < 0.01 {
*sel_i
} else {
sel_i.powf(1.0 - max_corr)
};
result *= adjusted_sel;
used[i] = true;
}
result.clamp(1e-10, 1.0)
}
pub fn detect_correlations_from_sample(
col1_values: &[Value],
col2_values: &[Value],
) -> Option<CorrelationCoefficient> {
if col1_values.len() != col2_values.len() || col1_values.is_empty() {
return None;
}
let numeric1: Vec<f64> = col1_values.iter().filter_map(|v| v.as_float64()).collect();
let numeric2: Vec<f64> = col2_values.iter().filter_map(|v| v.as_float64()).collect();
if numeric1.len() == col1_values.len() && numeric2.len() == col2_values.len() {
return Some(Self::pearson_correlation(&numeric1, &numeric2));
}
Some(Self::categorical_correlation(col1_values, col2_values))
}
fn pearson_correlation(x: &[f64], y: &[f64]) -> f64 {
let n = x.len() as f64;
if n < 2.0 {
return 0.0;
}
let sum_x: f64 = x.iter().sum();
let sum_y: f64 = y.iter().sum();
let sum_xy: f64 = x.iter().zip(y.iter()).map(|(a, b)| a * b).sum();
let sum_x2: f64 = x.iter().map(|a| a * a).sum();
let sum_y2: f64 = y.iter().map(|a| a * a).sum();
let numerator = n * sum_xy - sum_x * sum_y;
let denominator = ((n * sum_x2 - sum_x * sum_x) * (n * sum_y2 - sum_y * sum_y)).sqrt();
if denominator.abs() < f64::EPSILON {
0.0
} else {
(numerator / denominator).clamp(-1.0, 1.0)
}
}
fn categorical_correlation(col1: &[Value], col2: &[Value]) -> f64 {
let mut pair_counts: AHashMap<(String, String), usize> = AHashMap::new();
let mut col1_counts: AHashMap<String, usize> = AHashMap::new();
let mut col2_counts: AHashMap<String, usize> = AHashMap::new();
for (v1, v2) in col1.iter().zip(col2.iter()) {
let s1 = v1.to_string();
let s2 = v2.to_string();
*pair_counts.entry((s1.clone(), s2.clone())).or_insert(0) += 1;
*col1_counts.entry(s1).or_insert(0) += 1;
*col2_counts.entry(s2).or_insert(0) += 1;
}
let n = col1.len() as f64;
let ndv1 = col1_counts.len() as f64;
let ndv2 = col2_counts.len() as f64;
if ndv1 <= 1.0 || ndv2 <= 1.0 {
return 0.0;
}
let mut chi_squared = 0.0;
for ((v1, v2), observed) in &pair_counts {
let expected = (col1_counts[v1] as f64 * col2_counts[v2] as f64) / n;
if expected > 0.0 {
chi_squared += (*observed as f64 - expected).powi(2) / expected;
}
}
let min_dim = (ndv1 - 1.0).min(ndv2 - 1.0).max(1.0);
let cramers_v = (chi_squared / (n * min_dim)).sqrt();
cramers_v.min(1.0)
}
}
impl SelectivityEstimator {
pub fn combined_selectivity_with_correlations(
selectivities: &[(&str, f64)],
correlations: Option<&ColumnCorrelations>,
) -> f64 {
match correlations {
Some(corr) => corr.combined_selectivity(selectivities),
None => {
selectivities.iter().map(|(_, s)| *s).product()
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_table_stats_new() {
let stats = TableStats::new("test_table".to_string());
assert_eq!(stats.table_name, "test_table");
assert_eq!(stats.row_count, 0);
}
#[test]
fn test_equality_selectivity() {
let sel = SelectivityEstimator::equality(100);
assert!((sel - 0.01).abs() < 0.001);
let sel_default = SelectivityEstimator::equality(0);
assert!((sel_default - 0.1).abs() < 0.001);
}
#[test]
fn test_in_list_selectivity() {
let sel = SelectivityEstimator::in_list(2, 5);
assert!((sel - 0.4).abs() < 0.001);
let sel_large = SelectivityEstimator::in_list(10, 5);
assert!((sel_large - 1.0).abs() < 0.001);
}
#[test]
fn test_null_selectivity() {
let sel = SelectivityEstimator::is_null(100, 1000);
assert!((sel - 0.1).abs() < 0.001);
let sel_not_null = SelectivityEstimator::is_not_null(100, 1000);
assert!((sel_not_null - 0.9).abs() < 0.001);
}
#[test]
fn test_join_cardinality() {
let join_card = SelectivityEstimator::join_cardinality(10000, 1000, 1000, 1000);
assert_eq!(join_card, 10000);
}
#[test]
fn test_like_selectivity() {
let sel_prefix = SelectivityEstimator::like("abc%", 1000);
assert!(sel_prefix < 0.1);
let sel_suffix = SelectivityEstimator::like("%abc", 1000);
assert!((sel_suffix - 0.25).abs() < 0.001);
}
#[test]
fn test_is_stats_table() {
assert!(is_stats_table("_sys_table_stats"));
assert!(is_stats_table("_SYS_TABLE_STATS"));
assert!(is_stats_table("_sys_column_stats"));
assert!(!is_stats_table("users"));
assert!(!is_stats_table("_sys_other"));
}
#[test]
fn test_column_stats_is_empty() {
let stats = ColumnStats::new("test".to_string());
assert!(stats.is_empty());
let mut stats2 = ColumnStats::new("test".to_string());
stats2.distinct_count = 10;
assert!(!stats2.is_empty());
}
#[test]
fn test_histogram_from_sorted_values() {
let values: Vec<Value> = (0..100).map(Value::Integer).collect();
let histogram = Histogram::from_sorted_values(&values, 10).unwrap();
assert!(!histogram.boundaries.is_empty());
assert_eq!(histogram.total_rows, 100);
assert_eq!(histogram.rows_per_bucket, 10);
assert_eq!(histogram.boundaries[0], Value::Integer(0));
}
#[test]
fn test_histogram_selectivity_estimation() {
let values: Vec<Value> = (0..100).map(Value::Integer).collect();
let histogram = Histogram::from_sorted_values(&values, 10).unwrap();
let sel_lt_50 = histogram.estimate_selectivity(&Value::Integer(50), HistogramOp::LessThan);
assert!(
sel_lt_50 > 0.4 && sel_lt_50 < 0.6,
"Expected ~0.5, got {}",
sel_lt_50
);
let sel_lt_10 = histogram.estimate_selectivity(&Value::Integer(10), HistogramOp::LessThan);
assert!(
sel_lt_10 > 0.05 && sel_lt_10 < 0.2,
"Expected ~0.1, got {}",
sel_lt_10
);
let sel_gt_90 =
histogram.estimate_selectivity(&Value::Integer(90), HistogramOp::GreaterThan);
assert!(
sel_gt_90 > 0.0 && sel_gt_90 < 0.2,
"Expected ~0.1, got {}",
sel_gt_90
);
}
#[test]
fn test_histogram_json_round_trip() {
let values: Vec<Value> = (0..100).map(Value::Integer).collect();
let histogram = Histogram::from_sorted_values(&values, 10).unwrap();
let json = histogram.to_json();
let parsed = Histogram::from_json(&json).expect("Failed to parse histogram JSON");
assert_eq!(parsed.total_rows, histogram.total_rows);
assert_eq!(parsed.rows_per_bucket, histogram.rows_per_bucket);
assert_eq!(parsed.boundaries.len(), histogram.boundaries.len());
}
#[test]
fn test_range_with_histogram() {
let values: Vec<Value> = (0..100).map(Value::Integer).collect();
let histogram = Histogram::from_sorted_values(&values, 10).unwrap();
let mut col_stats = ColumnStats::new("test".to_string());
col_stats.set_histogram(&histogram);
col_stats.min_value = Some(Value::Integer(0));
col_stats.max_value = Some(Value::Integer(99));
col_stats.distinct_count = 100;
let sel = SelectivityEstimator::range_with_histogram(
&col_stats,
&Value::Integer(50),
HistogramOp::LessThan,
);
assert!(sel > 0.4 && sel < 0.6, "Expected ~0.5, got {}", sel);
}
#[test]
fn test_histogram_empty_values() {
let values: Vec<Value> = vec![];
let histogram = Histogram::from_sorted_values(&values, 10);
assert!(histogram.is_none());
}
#[test]
fn test_histogram_with_nulls() {
use crate::core::DataType;
let mut values: Vec<Value> = (0..50).map(Value::Integer).collect();
values.extend((0..10).map(|_| Value::Null(DataType::Integer)));
values.extend((50..100).map(Value::Integer));
let histogram = Histogram::from_sorted_values(&values, 10).unwrap();
assert_eq!(histogram.total_rows, 100); }
#[test]
fn test_histogram_join_cardinality_full_overlap() {
let left_values: Vec<Value> = (0..100).map(Value::Integer).collect();
let right_values: Vec<Value> = (0..100).map(Value::Integer).collect();
let left_hist = Histogram::from_sorted_values(&left_values, 10).unwrap();
let right_hist = Histogram::from_sorted_values(&right_values, 10).unwrap();
let mut left_stats = ColumnStats::new("left_col".to_string());
left_stats.set_histogram(&left_hist);
left_stats.min_value = Some(Value::Integer(0));
left_stats.max_value = Some(Value::Integer(99));
left_stats.distinct_count = 100;
let mut right_stats = ColumnStats::new("right_col".to_string());
right_stats.set_histogram(&right_hist);
right_stats.min_value = Some(Value::Integer(0));
right_stats.max_value = Some(Value::Integer(99));
right_stats.distinct_count = 100;
let cardinality =
SelectivityEstimator::histogram_join_cardinality(&left_stats, &right_stats, 100, 100);
assert!(
(50..=200).contains(&cardinality),
"Expected ~100 rows for full overlap join, got {}",
cardinality
);
}
#[test]
fn test_histogram_join_cardinality_no_overlap() {
let left_values: Vec<Value> = (0..100).map(Value::Integer).collect();
let right_values: Vec<Value> = (200..300).map(Value::Integer).collect();
let left_hist = Histogram::from_sorted_values(&left_values, 10).unwrap();
let right_hist = Histogram::from_sorted_values(&right_values, 10).unwrap();
let mut left_stats = ColumnStats::new("left_col".to_string());
left_stats.set_histogram(&left_hist);
left_stats.min_value = Some(Value::Integer(0));
left_stats.max_value = Some(Value::Integer(99));
left_stats.distinct_count = 100;
let mut right_stats = ColumnStats::new("right_col".to_string());
right_stats.set_histogram(&right_hist);
right_stats.min_value = Some(Value::Integer(200));
right_stats.max_value = Some(Value::Integer(299));
right_stats.distinct_count = 100;
let cardinality =
SelectivityEstimator::histogram_join_cardinality(&left_stats, &right_stats, 100, 100);
assert!(
cardinality <= 10,
"Expected ~0 rows for non-overlapping join, got {}",
cardinality
);
}
#[test]
fn test_histogram_join_cardinality_partial_overlap() {
let left_values: Vec<Value> = (0..100).map(Value::Integer).collect();
let right_values: Vec<Value> = (50..150).map(Value::Integer).collect();
let left_hist = Histogram::from_sorted_values(&left_values, 10).unwrap();
let right_hist = Histogram::from_sorted_values(&right_values, 10).unwrap();
let mut left_stats = ColumnStats::new("left_col".to_string());
left_stats.set_histogram(&left_hist);
left_stats.min_value = Some(Value::Integer(0));
left_stats.max_value = Some(Value::Integer(99));
left_stats.distinct_count = 100;
let mut right_stats = ColumnStats::new("right_col".to_string());
right_stats.set_histogram(&right_hist);
right_stats.min_value = Some(Value::Integer(50));
right_stats.max_value = Some(Value::Integer(149));
right_stats.distinct_count = 100;
let cardinality =
SelectivityEstimator::histogram_join_cardinality(&left_stats, &right_stats, 100, 100);
assert!(
(20..=100).contains(&cardinality),
"Expected ~50 rows for 50% overlap join, got {}",
cardinality
);
}
#[test]
fn test_range_overlap_ratio() {
let ratio1 = SelectivityEstimator::range_overlap_ratio(
&Value::Integer(0),
&Value::Integer(100),
&Value::Integer(0),
&Value::Integer(100),
);
assert!(
(ratio1 - 1.0).abs() < 0.01,
"Full overlap should be 1.0, got {}",
ratio1
);
let ratio2 = SelectivityEstimator::range_overlap_ratio(
&Value::Integer(0),
&Value::Integer(50),
&Value::Integer(100),
&Value::Integer(150),
);
assert!(ratio2 < 0.01, "No overlap should be 0.0, got {}", ratio2);
let ratio3 = SelectivityEstimator::range_overlap_ratio(
&Value::Integer(0),
&Value::Integer(100),
&Value::Integer(50),
&Value::Integer(150),
);
assert!(
ratio3 > 0.3 && ratio3 < 0.7,
"Partial overlap should be ~0.5, got {}",
ratio3
);
}
#[test]
fn test_correlation_basic() {
let mut corr = ColumnCorrelations::new();
corr.add_correlation("city", "state", 0.95);
assert!((corr.get_correlation("city", "state") - 0.95).abs() < 0.001);
assert!((corr.get_correlation("state", "city") - 0.95).abs() < 0.001);
assert!((corr.get_correlation("city", "zip")).abs() < 0.001);
}
#[test]
fn test_functional_dependency() {
let mut corr = ColumnCorrelations::new();
corr.add_functional_dep("zip_code", "city");
assert!(corr.determines("zip_code", "city"));
assert!(!corr.determines("city", "zip_code"));
assert!((corr.get_correlation("zip_code", "city") - 1.0).abs() < 0.001);
}
#[test]
fn test_combined_selectivity_independent() {
let corr = ColumnCorrelations::new();
let selectivities = [("col1", 0.1), ("col2", 0.1)];
let combined = corr.combined_selectivity(&selectivities);
assert!(
combined > 0.005 && combined < 0.02,
"Independent columns: expected ~0.01, got {}",
combined
);
}
#[test]
fn test_combined_selectivity_correlated() {
let mut corr = ColumnCorrelations::new();
corr.add_correlation("city", "state", 0.99);
let selectivities = [("city", 0.02), ("state", 0.02)];
let combined = corr.combined_selectivity(&selectivities);
assert!(
combined > 0.01,
"Correlated columns: expected > 0.01, got {}",
combined
);
let naive = selectivities.iter().map(|(_, s)| *s).product::<f64>();
assert!(
combined > naive * 10.0,
"Correlated selectivity should be much higher than naive: {} vs {}",
combined,
naive
);
}
#[test]
fn test_combined_selectivity_partial_correlation() {
let mut corr = ColumnCorrelations::new();
corr.add_correlation("age", "income", 0.5);
let selectivities = [("age", 0.1), ("income", 0.1)];
let combined = corr.combined_selectivity(&selectivities);
let naive = 0.1 * 0.1; let max = 0.1;
assert!(
combined >= naive && combined <= max,
"Partial correlation: expected between {} and {}, got {}",
naive,
max,
combined
);
}
#[test]
fn test_pearson_correlation() {
let x = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let y = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let corr = ColumnCorrelations::pearson_correlation(&x, &y);
assert!(
(corr - 1.0).abs() < 0.001,
"Perfect correlation should be 1.0, got {}",
corr
);
let y_neg = vec![-1.0, -2.0, -3.0, -4.0, -5.0];
let corr_neg = ColumnCorrelations::pearson_correlation(&x, &y_neg);
assert!(
(corr_neg - (-1.0)).abs() < 0.001,
"Perfect negative correlation should be -1.0, got {}",
corr_neg
);
let y_random = vec![3.0, 1.0, 4.0, 1.0, 5.0];
let corr_random = ColumnCorrelations::pearson_correlation(&x, &y_random);
assert!(
corr_random.abs() < 0.8,
"Low correlation expected, got {}",
corr_random
);
}
#[test]
fn test_detect_correlations_numeric() {
let col1: Vec<Value> = (0..100).map(Value::Integer).collect();
let col2: Vec<Value> = (0..100).map(Value::Integer).collect();
let corr = ColumnCorrelations::detect_correlations_from_sample(&col1, &col2);
assert!(corr.is_some());
assert!(
(corr.unwrap() - 1.0).abs() < 0.01,
"Identical columns should have correlation ~1.0"
);
}
#[test]
fn test_detect_correlations_categorical() {
let cities = vec![
Value::Text("NYC".into()),
Value::Text("NYC".into()),
Value::Text("LA".into()),
Value::Text("LA".into()),
Value::Text("Chicago".into()),
];
let states = vec![
Value::Text("NY".into()),
Value::Text("NY".into()),
Value::Text("CA".into()),
Value::Text("CA".into()),
Value::Text("IL".into()),
];
let corr = ColumnCorrelations::detect_correlations_from_sample(&cities, &states);
assert!(corr.is_some());
assert!(
corr.unwrap() > 0.8,
"City/state should be highly correlated"
);
}
#[test]
fn test_selectivity_with_correlations_api() {
let mut corr = ColumnCorrelations::new();
corr.add_correlation("a", "b", 0.9);
let selectivities = [("a", 0.1), ("b", 0.1)];
let with_corr = SelectivityEstimator::combined_selectivity_with_correlations(
&selectivities,
Some(&corr),
);
let without_corr =
SelectivityEstimator::combined_selectivity_with_correlations(&selectivities, None);
assert!(
with_corr > without_corr,
"Correlated selectivity should be higher: {} vs {}",
with_corr,
without_corr
);
}
#[test]
fn test_histogram_between_range_selectivity() {
let values: Vec<Value> = (0..100).map(Value::Integer).collect();
let histogram = Histogram::from_sorted_values(&values, 10).unwrap();
let sel_25_75 =
histogram.estimate_range_selectivity(&Value::Integer(25), &Value::Integer(75));
assert!(
sel_25_75 > 0.4 && sel_25_75 < 0.65,
"Expected ~0.5 for BETWEEN 25 AND 75, got {}",
sel_25_75
);
let sel_0_10 =
histogram.estimate_range_selectivity(&Value::Integer(0), &Value::Integer(10));
assert!(
sel_0_10 > 0.05 && sel_0_10 < 0.2,
"Expected ~0.1 for BETWEEN 0 AND 10, got {}",
sel_0_10
);
let sel_90_100 =
histogram.estimate_range_selectivity(&Value::Integer(90), &Value::Integer(100));
assert!(
sel_90_100 > 0.05 && sel_90_100 < 0.2,
"Expected ~0.1 for BETWEEN 90 AND 100, got {}",
sel_90_100
);
let sel_full =
histogram.estimate_range_selectivity(&Value::Integer(0), &Value::Integer(100));
assert!(
sel_full > 0.9,
"Expected ~1.0 for full range, got {}",
sel_full
);
}
#[test]
fn test_histogram_between_single_bucket() {
let values: Vec<Value> = (0..100).map(Value::Integer).collect();
let histogram = Histogram::from_sorted_values(&values, 10).unwrap();
let sel_5_8 = histogram.estimate_range_selectivity(&Value::Integer(5), &Value::Integer(8));
assert!(
sel_5_8 > 0.0 && sel_5_8 < 0.15,
"Expected small selectivity for narrow range, got {}",
sel_5_8
);
}
#[test]
fn test_histogram_between_float_values() {
let values: Vec<Value> = (0..100).map(|i| Value::Float(i as f64)).collect();
let histogram = Histogram::from_sorted_values(&values, 10).unwrap();
let sel = histogram.estimate_range_selectivity(&Value::Float(25.0), &Value::Float(75.0));
assert!(
sel > 0.4 && sel < 0.65,
"Expected ~0.5 for BETWEEN 25.0 AND 75.0, got {}",
sel
);
}
#[test]
fn test_adaptive_histogram_high_cardinality() {
let values: Vec<Value> = (0..1000).map(Value::Integer).collect();
let histogram = Histogram::adaptive_from_sorted_values(&values, 1000, 10, 200).unwrap();
assert!(
histogram.boundaries.len() >= 10 && histogram.boundaries.len() <= 50,
"High cardinality should use moderate buckets, got {}",
histogram.boundaries.len()
);
}
#[test]
fn test_adaptive_histogram_low_cardinality() {
let mut values: Vec<Value> = Vec::with_capacity(10000);
for i in 0..10 {
for _ in 0..1000 {
values.push(Value::Integer(i));
}
}
values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let histogram = Histogram::adaptive_from_sorted_values(&values, 10, 10, 200).unwrap();
assert!(
histogram.boundaries.len() >= 10,
"Low cardinality should use minimum buckets, got {}",
histogram.boundaries.len()
);
}
#[test]
fn test_adaptive_histogram_medium_cardinality() {
let mut values: Vec<Value> = Vec::with_capacity(10000);
for i in 0..100 {
for _ in 0..100 {
values.push(Value::Integer(i));
}
}
values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let histogram = Histogram::adaptive_from_sorted_values(&values, 100, 10, 200).unwrap();
assert!(
histogram.boundaries.len() >= 10 && histogram.boundaries.len() <= 100,
"Medium cardinality should use proportional buckets, got {}",
histogram.boundaries.len()
);
}
#[test]
fn test_adaptive_histogram_respects_bounds() {
let values: Vec<Value> = (0..1000000).map(Value::Integer).collect();
let histogram_min =
Histogram::adaptive_from_sorted_values(&values, 1000000, 50, 200).unwrap();
assert!(
histogram_min.boundaries.len() >= 50,
"Should respect min_buckets, got {}",
histogram_min.boundaries.len()
);
let mut skewed: Vec<Value> = Vec::with_capacity(100000);
for i in 0..10 {
for _ in 0..10000 {
skewed.push(Value::Integer(i));
}
}
let histogram_max = Histogram::adaptive_from_sorted_values(&skewed, 10, 10, 50).unwrap();
assert!(
histogram_max.boundaries.len() <= 51, "Should respect max_buckets, got {}",
histogram_max.boundaries.len()
);
}
}