use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use super::ml_predictor_training::TrainingExample;
/// Equi-width histogram summarizing a collection of `f64` samples.
///
/// Instances are produced by `ValueHistogram::from_data` (or
/// `ValueHistogram::empty`) and are queried for selectivity estimates.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValueHistogram {
/// Buckets in ascending order of their bounds; empty for an empty histogram.
pub buckets: Vec<HistogramBucket>,
/// Number of samples summarized by the histogram.
pub total_count: usize,
/// Number of distinct sample values recorded across all buckets.
pub distinct_count: usize,
/// Smallest sample value (`0.0` for an empty histogram).
pub min_value: f64,
/// Largest sample value (`0.0` for an empty histogram).
pub max_value: f64,
}
/// One interval of a [`ValueHistogram`] together with its sample counts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HistogramBucket {
/// Lower edge of the interval; lookups treat it as inclusive.
pub lower_bound: f64,
/// Upper edge of the interval. The builder pads the last bucket's edge by
/// `1e-10` past the maximum sample.
pub upper_bound: f64,
/// Number of samples assigned to this bucket.
pub count: usize,
/// Number of distinct sample values assigned to this bucket.
pub distinct_count: usize,
}
impl ValueHistogram {
pub fn from_data(data: &[f64], num_buckets: usize) -> Self {
if data.is_empty() {
return Self::empty();
}
let min_value = data.iter().copied().fold(f64::INFINITY, f64::min);
let max_value = data.iter().copied().fold(f64::NEG_INFINITY, f64::max);
if (max_value - min_value).abs() < 1e-10 {
return Self {
buckets: vec![HistogramBucket {
lower_bound: min_value,
upper_bound: max_value + 1e-10,
count: data.len(),
distinct_count: 1,
}],
total_count: data.len(),
distinct_count: 1,
min_value,
max_value,
};
}
let bucket_width = (max_value - min_value) / num_buckets as f64;
let mut buckets = Vec::with_capacity(num_buckets);
for i in 0..num_buckets {
let lower = min_value + i as f64 * bucket_width;
let upper = if i == num_buckets - 1 {
max_value + 1e-10 } else {
min_value + (i + 1) as f64 * bucket_width
};
buckets.push(HistogramBucket {
lower_bound: lower,
upper_bound: upper,
count: 0,
distinct_count: 0,
});
}
use std::collections::HashSet;
let mut bucket_distinct: Vec<HashSet<u64>> = vec![HashSet::new(); num_buckets];
for &value in data {
let bucket_idx = if value >= max_value {
num_buckets - 1
} else {
((value - min_value) / bucket_width).floor() as usize
};
if bucket_idx < num_buckets {
buckets[bucket_idx].count += 1;
let value_bits = value.to_bits();
bucket_distinct[bucket_idx].insert(value_bits);
}
}
for (i, bucket) in buckets.iter_mut().enumerate() {
bucket.distinct_count = bucket_distinct[i].len();
}
let distinct_count = bucket_distinct.iter().map(|s| s.len()).sum();
Self {
buckets,
total_count: data.len(),
distinct_count,
min_value,
max_value,
}
}
pub fn empty() -> Self {
Self {
buckets: Vec::new(),
total_count: 0,
distinct_count: 0,
min_value: 0.0,
max_value: 0.0,
}
}
pub fn estimate_equality_selectivity(&self, value: f64) -> f64 {
if self.total_count == 0 {
return 0.0;
}
for bucket in &self.buckets {
if value >= bucket.lower_bound && value < bucket.upper_bound {
if bucket.distinct_count == 0 {
return 0.0;
}
let selectivity =
bucket.count as f64 / bucket.distinct_count as f64 / self.total_count as f64;
return selectivity.min(1.0);
}
}
0.0
}
pub fn estimate_range_selectivity(&self, lower: f64, upper: f64) -> f64 {
if self.total_count == 0 || upper <= lower {
return 0.0;
}
let mut selected_count = 0.0;
for bucket in &self.buckets {
let overlap_lower = bucket.lower_bound.max(lower);
let overlap_upper = bucket.upper_bound.min(upper);
if overlap_upper > overlap_lower {
let bucket_width = bucket.upper_bound - bucket.lower_bound;
let overlap_width = overlap_upper - overlap_lower;
if bucket_width > 1e-10 {
let fraction = overlap_width / bucket_width;
selected_count += bucket.count as f64 * fraction;
}
}
}
(selected_count / self.total_count as f64).min(1.0)
}
pub fn estimate_cardinality(&self, selectivity: f64) -> usize {
(self.total_count as f64 * selectivity).ceil() as usize
}
pub fn avg_bucket_size(&self) -> f64 {
if self.buckets.is_empty() {
return 0.0;
}
self.total_count as f64 / self.buckets.len() as f64
}
}
/// Cardinality estimator backed by one [`ValueHistogram`] per feature.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HistogramCardinalityEstimator {
/// Histograms keyed by feature name; the builder in this file uses keys of
/// the form `feature_<index>`.
pub histograms: HashMap<String, ValueHistogram>,
/// Settings controlling whether and how histograms are built and used.
pub config: HistogramConfig,
}
/// Configuration for [`HistogramCardinalityEstimator`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HistogramConfig {
/// Bucket count passed to `ValueHistogram::from_data` for every feature.
pub num_buckets: usize,
/// When `false`, the estimator neither builds histograms nor produces
/// estimates.
pub enabled: bool,
}
impl Default for HistogramConfig {
fn default() -> Self {
Self {
num_buckets: 100,
enabled: true,
}
}
}
impl HistogramCardinalityEstimator {
    /// Creates an estimator with the given configuration and no histograms.
    pub fn new(config: HistogramConfig) -> Self {
        Self {
            histograms: HashMap::new(),
            config,
        }
    }

    /// Map key under which the histogram of the feature at `index` is stored.
    ///
    /// Kept in one place so building and lookup cannot drift apart.
    fn feature_key(index: usize) -> String {
        format!("feature_{}", index)
    }

    /// Builds one histogram per feature column of `training_data`.
    ///
    /// Does nothing when the estimator is disabled or `training_data` is
    /// empty. The number of features is taken from the first example; an
    /// example with fewer features contributes `0.0` for the missing ones.
    /// Histograms are stored under `feature_<index>`, replacing any entry
    /// with the same key; entries under other keys are left untouched.
    pub fn build_from_training_data(&mut self, training_data: &[TrainingExample]) {
        let n_features = match training_data.first() {
            Some(first) if self.config.enabled => first.features.len(),
            _ => return,
        };
        for feature_idx in 0..n_features {
            let column: Vec<f64> = training_data
                .iter()
                .map(|example| example.features.get(feature_idx).copied().unwrap_or(0.0))
                .collect();
            let histogram = ValueHistogram::from_data(&column, self.config.num_buckets);
            self.histograms
                .insert(Self::feature_key(feature_idx), histogram);
        }
    }

    /// Estimates a cardinality for the feature vector `features`.
    ///
    /// For every feature that has a histogram and a strictly positive
    /// equality selectivity, the selectivities are combined by geometric
    /// mean. The result is that mean multiplied by the average `total_count`
    /// over all stored histograms.
    ///
    /// Returns `None` when the estimator is disabled, holds no histograms, or
    /// no feature produced a positive selectivity.
    pub fn estimate_cardinality_with_histogram(&self, features: &[f64]) -> Option<f64> {
        if !self.config.enabled || self.histograms.is_empty() {
            return None;
        }

        // Accumulate in log space: a running product of many small
        // selectivities underflows to 0.0 and would zero the whole estimate.
        let mut ln_sum = 0.0_f64;
        let mut used = 0_usize;
        for (i, &feature_value) in features.iter().enumerate() {
            if let Some(histogram) = self.histograms.get(&Self::feature_key(i)) {
                let selectivity = histogram.estimate_equality_selectivity(feature_value);
                if selectivity > 0.0 {
                    ln_sum += selectivity.ln();
                    used += 1;
                }
            }
        }
        if used == 0 {
            return None;
        }

        let geometric_mean = (ln_sum / used as f64).exp();
        let avg_total = self
            .histograms
            .values()
            .map(|h| h.total_count as f64)
            .sum::<f64>()
            / self.histograms.len() as f64;
        Some(avg_total * geometric_mean)
    }

    /// Summarizes the stored histograms: how many there are, their combined
    /// bucket count, and the mean `distinct_count` (`0.0` when there are none).
    pub fn get_statistics(&self) -> HistogramStatistics {
        let num_histograms = self.histograms.len();
        let total_buckets: usize = self.histograms.values().map(|h| h.buckets.len()).sum();
        let avg_distinct_values = if num_histograms == 0 {
            0.0
        } else {
            self.histograms
                .values()
                .map(|h| h.distinct_count as f64)
                .sum::<f64>()
                / num_histograms as f64
        };
        HistogramStatistics {
            num_histograms,
            total_buckets,
            avg_distinct_values,
        }
    }
}
/// Summary figures reported by `HistogramCardinalityEstimator::get_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HistogramStatistics {
/// Number of histograms held by the estimator.
pub num_histograms: usize,
/// Sum of the bucket counts of all histograms.
pub total_buckets: usize,
/// Mean `distinct_count` over all histograms (`0.0` when there are none).
pub avg_distinct_values: f64,
}
/// Plain data record describing structural properties of a query.
///
/// NOTE(review): nothing in this part of the file reads or writes these
/// fields, so the per-field notes below are inferred from the names only —
/// confirm against the code that populates this struct.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QueryCharacteristics {
/// Presumably the number of triple patterns in the query.
pub triple_pattern_count: usize,
/// Presumably the number of joins.
pub join_count: usize,
/// Presumably the number of filter expressions.
pub filter_count: usize,
/// Presumably the number of optional clauses.
pub optional_count: usize,
/// Presumably whether the query aggregates results.
pub has_aggregation: bool,
/// Presumably whether the query orders its results.
pub has_sorting: bool,
/// Presumably an estimated result size; units/source not visible here.
pub estimated_cardinality: usize,
/// Complexity measure; scale and formula are not visible here.
pub complexity_score: f64,
/// Presumably the diameter of the query graph.
pub query_graph_diameter: usize,
/// Presumably the average node degree of the query graph.
pub avg_degree: f64,
/// Presumably the maximum node degree of the query graph.
pub max_degree: usize,
}
/// Holder for feature weights and optional normalization parameters.
///
/// NOTE(review): the extraction logic that consumes these fields is not
/// visible in this part of the file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureExtractor {
/// Weights keyed by name; starts out empty.
pub(super) feature_weights: HashMap<String, f64>,
/// Normalization parameters, `None` until some are supplied.
pub(super) normalization_params: Option<NormalizationParams>,
}
impl FeatureExtractor {
pub fn new() -> Self {
Self {
feature_weights: HashMap::new(),
normalization_params: None,
}
}
}
impl Default for FeatureExtractor {
fn default() -> Self {
Self::new()
}
}
/// Four parallel vectors of per-feature statistics.
///
/// NOTE(review): the vectors are presumably indexed by feature position and
/// expected to have equal length, but nothing in this file enforces that.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NormalizationParams {
/// Mean values; its length is what `len()` / `is_empty()` report.
pub(super) mean: Vec<f64>,
/// Standard deviations.
pub(super) std_dev: Vec<f64>,
/// Minimum values.
pub(super) min_values: Vec<f64>,
/// Maximum values.
pub(super) max_values: Vec<f64>,
}
impl NormalizationParams {
pub fn new(
mean: Vec<f64>,
std_dev: Vec<f64>,
min_values: Vec<f64>,
max_values: Vec<f64>,
) -> Self {
Self {
mean,
std_dev,
min_values,
max_values,
}
}
pub fn len(&self) -> usize {
self.mean.len()
}
pub fn is_empty(&self) -> bool {
self.mean.is_empty()
}
pub fn mean(&self) -> &[f64] {
&self.mean
}
pub fn std_dev(&self) -> &[f64] {
&self.std_dev
}
pub fn min_values(&self) -> &[f64] {
&self.min_values
}
pub fn max_values(&self) -> &[f64] {
&self.max_values
}
}