use std::collections::HashMap;
/// Default for [`AnalyzeOptions::sample_target`].
pub const DEFAULT_SAMPLE_TARGET: usize = 30_000;
/// Default for [`AnalyzeOptions::hist_buckets`].
pub const DEFAULT_HIST_BUCKETS: usize = 100;
/// Default for [`AnalyzeOptions::mcv_size`].
pub const DEFAULT_MCV_SIZE: usize = 100;
/// Per-column statistics, as populated by `compute_column_stats`.
///
/// Values are handled as strings, so ordering-based fields (`hist_bounds`,
/// `min_value`, `max_value`) follow `String` ordering rather than any numeric
/// or typed ordering.
#[derive(Debug, Clone, Default)]
pub struct ColumnAnalysis {
/// Column name, copied from the caller-supplied name list.
pub name: String,
/// Number of distinct non-null values seen in the sample.
pub distinct_count: u64,
/// Number of sampled rows where this column was `None` or the row was too
/// short to contain the column.
pub null_count: u64,
/// Caller-supplied count, stored unchanged (it is not derived from the sample).
pub total_count: u64,
/// Most common values paired with their frequency, expressed as a fraction
/// of the number of sampled rows; ordered by descending frequency.
pub mcv: Vec<(String, f64)>,
/// Histogram boundaries picked at evenly spaced positions in the sorted
/// non-null sample values. Empty when the sample had no non-null values.
pub hist_bounds: Vec<String>,
/// Smallest non-null sampled value, or `None` if there was none.
pub min_value: Option<String>,
/// Largest non-null sampled value, or `None` if there was none.
pub max_value: Option<String>,
}
/// Table-level analysis result bundling per-column statistics.
///
/// `build_table_analysis` stores every field exactly as given.
#[derive(Debug, Clone, Default)]
pub struct TableAnalysis {
/// Name of the analysed table.
pub table: String,
/// Row count reported for the table (`run_analyze_from_sample` passes its
/// `total_count` argument here).
pub row_count: u64,
/// Row size estimate; `run_analyze_from_sample` derives it from the byte
/// lengths of sampled string values.
pub avg_row_size: u64,
/// Statistics for each analysed column.
pub columns: Vec<ColumnAnalysis>,
/// Elapsed time in seconds as supplied by the caller
/// (`run_analyze_from_sample` always passes `0.0`).
pub elapsed_secs: f64,
}
/// Tunables for an analyze run. `Default` uses the module's `DEFAULT_*` constants.
#[derive(Debug, Clone, Copy)]
pub struct AnalyzeOptions {
/// Presumably the number of rows to sample; not read by the code visible
/// in this file section — TODO confirm against the sampling caller.
pub sample_target: usize,
/// Number of histogram buckets; `compute_column_stats` emits
/// `hist_buckets + 1` boundaries per column with non-null values.
pub hist_buckets: usize,
/// Maximum number of most-common-value entries kept per column.
pub mcv_size: usize,
/// NOTE(review): not read by the code visible here; presumably selects
/// whether every column is analysed — confirm against the caller.
pub analyse_all_columns: bool,
}
/// Default options: sizes come from the module-level `DEFAULT_*` constants and
/// `analyse_all_columns` is off.
impl Default for AnalyzeOptions {
    fn default() -> Self {
        let sample_target = DEFAULT_SAMPLE_TARGET;
        let hist_buckets = DEFAULT_HIST_BUCKETS;
        let mcv_size = DEFAULT_MCV_SIZE;
        Self {
            sample_target,
            hist_buckets,
            mcv_size,
            analyse_all_columns: false,
        }
    }
}
/// Fixed-capacity reservoir sampler over row indices.
///
/// The first `capacity` observed rows are always kept. After that, each new
/// row replaces a randomly chosen slot with probability `capacity / rows_seen`.
/// Randomness comes from a small xorshift generator seeded by the caller, so
/// a given seed and observation sequence always yields the same sample.
#[derive(Debug, Clone)]
pub struct Reservoir {
    /// Maximum number of row indices retained.
    capacity: usize,
    /// Currently retained row indices (unordered until consumed).
    samples: Vec<usize>,
    /// Total number of rows passed to `observe` so far.
    rows_seen: u64,
    /// Xorshift generator state; never zero.
    rng_state: u64,
}

impl Reservoir {
    /// Creates an empty reservoir holding at most `capacity` row indices.
    ///
    /// A `seed` of `0` is replaced by a fixed non-zero constant, because a
    /// zero state would make the xorshift generator emit zeros forever.
    pub fn new(capacity: usize, seed: u64) -> Self {
        Self {
            capacity,
            samples: Vec::with_capacity(capacity),
            rows_seen: 0,
            rng_state: if seed == 0 { 0x9E3779B97F4A7C15 } else { seed },
        }
    }

    /// Offers `row_index` to the reservoir.
    ///
    /// Returns `true` when the row was stored (either appended while filling
    /// up, or swapped in for an earlier sample) and `false` when it was
    /// skipped. With a capacity of zero every row is skipped.
    pub fn observe(&mut self, row_index: usize) -> bool {
        self.rows_seen += 1;
        if self.samples.len() < self.capacity {
            self.samples.push(row_index);
            return true;
        }
        let slot = self.next_u64() % self.rows_seen;
        // Compare in u64: narrowing `slot` to usize first would truncate on
        // 32-bit targets and could alias an out-of-range draw into the
        // reservoir. Widening `capacity` is lossless on supported targets.
        if slot < self.capacity as u64 {
            // `slot < capacity <= usize::MAX`, so this narrowing is exact.
            self.samples[slot as usize] = row_index;
            return true;
        }
        false
    }

    /// Advances the xorshift generator (shifts 13, 7, 17) and returns the new state.
    fn next_u64(&mut self) -> u64 {
        let mut x = self.rng_state;
        x ^= x << 13;
        x ^= x >> 7;
        x ^= x << 17;
        self.rng_state = x;
        x
    }

    /// Consumes the reservoir and returns the sampled row indices in ascending order.
    pub fn into_sorted_indices(mut self) -> Vec<usize> {
        self.samples.sort_unstable();
        self.samples
    }
}
/// Computes per-column statistics from a sample of rows.
///
/// For each entry of `column_names` (matched to row cells by position) this
/// produces a [`ColumnAnalysis`] with:
///
/// * `null_count` — sampled rows whose cell is `None`, plus rows too short to
///   contain the column;
/// * `distinct_count` — distinct non-null values in the sample;
/// * `min_value` / `max_value` — extremes under `String` ordering;
/// * `mcv` — up to `opts.mcv_size` most common values with their frequency as
///   a fraction of *all* sampled rows, ordered by descending count and then
///   by value so that ties are resolved deterministically;
/// * `hist_bounds` — `opts.hist_buckets + 1` boundaries taken at evenly spaced
///   positions of the sorted non-null values; empty when the column has no
///   non-null values or when `opts.hist_buckets` is `0`.
///
/// `total_count` is stored unchanged in every result; the other counts refer
/// to the sample only.
pub fn compute_column_stats(
    column_names: &[String],
    sampled_rows: &[Vec<Option<String>>],
    total_count: u64,
    opts: AnalyzeOptions,
) -> Vec<ColumnAnalysis> {
    let sample_len = sampled_rows.len() as f64;

    column_names
        .iter()
        .enumerate()
        .map(|(col_idx, name)| {
            let mut null_count = 0u64;
            // Borrow the cell strings while aggregating; only values that end
            // up in the result are copied.
            let mut freq: HashMap<&str, u64> = HashMap::new();
            let mut sorted_values: Vec<&str> = Vec::with_capacity(sampled_rows.len());
            for row in sampled_rows {
                match row.get(col_idx) {
                    Some(Some(v)) => {
                        *freq.entry(v.as_str()).or_insert(0) += 1;
                        sorted_values.push(v.as_str());
                    }
                    _ => null_count += 1,
                }
            }

            let distinct_count = freq.len() as u64;
            sorted_values.sort_unstable();
            let min_value = sorted_values.first().map(|v| v.to_string());
            let max_value = sorted_values.last().map(|v| v.to_string());

            let mut mcv_pairs: Vec<(&str, u64)> = freq.into_iter().collect();
            // Hash-map iteration order is arbitrary, so break count ties by
            // value to keep both the order and the truncation reproducible.
            mcv_pairs.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
            mcv_pairs.truncate(opts.mcv_size);
            let mcv: Vec<(String, f64)> = mcv_pairs
                .into_iter()
                .map(|(value, count)| (value.to_string(), count as f64 / sample_len))
                .collect();

            // Zero buckets would divide by zero below; treat it as "no histogram".
            let hist_bounds: Vec<String> = if sorted_values.is_empty() || opts.hist_buckets == 0 {
                Vec::new()
            } else {
                let last = sorted_values.len() - 1;
                (0..=opts.hist_buckets)
                    .map(|b| {
                        let idx = (b * last / opts.hist_buckets).min(last);
                        sorted_values[idx].to_string()
                    })
                    .collect()
            };

            ColumnAnalysis {
                name: name.clone(),
                distinct_count,
                null_count,
                total_count,
                mcv,
                hist_bounds,
                min_value,
                max_value,
            }
        })
        .collect()
}
/// Assembles a [`TableAnalysis`] from already-computed parts.
///
/// `table` is converted into an owned `String`; every other argument is
/// moved into the field of the same name without modification.
pub fn build_table_analysis(
    table: impl Into<String>,
    row_count: u64,
    avg_row_size: u64,
    columns: Vec<ColumnAnalysis>,
    elapsed_secs: f64,
) -> TableAnalysis {
    let table: String = table.into();
    TableAnalysis {
        table,
        row_count,
        avg_row_size,
        columns,
        elapsed_secs,
    }
}
/// Builds a [`TableAnalysis`] for `table_name` from an already-drawn sample.
///
/// Column statistics are computed with `AnalyzeOptions::default()`.
/// `total_count` is used both as the table's `row_count` and as each column's
/// `total_count`.
///
/// `avg_row_size` is the mean, over all sampled rows, of the summed byte
/// lengths of the row's non-null values (nulls contribute `0`), rounded down.
/// It is `0` for an empty sample.
///
/// `elapsed_secs` is always reported as `0.0`; no timing is done here.
pub fn run_analyze_from_sample(
    table_name: impl Into<String>,
    column_names: &[String],
    sampled_rows: &[Vec<Option<String>>],
    total_count: u64,
) -> TableAnalysis {
    let opts = AnalyzeOptions::default();
    let columns = compute_column_stats(column_names, sampled_rows, total_count, opts);

    // Average over the whole sample, not just the first row, so one
    // unrepresentative row cannot decide the estimate.
    let avg_row_size = if sampled_rows.is_empty() {
        0
    } else {
        let total_bytes: u64 = sampled_rows
            .iter()
            .flat_map(|row| row.iter())
            .map(|cell| cell.as_ref().map_or(0, |s| s.len() as u64))
            .sum();
        total_bytes / sampled_rows.len() as u64
    };

    build_table_analysis(table_name, total_count, avg_row_size, columns, 0.0)
}