use std::collections::HashMap;
use std::time::Instant;
#[derive(Debug, Clone)]
pub struct HistogramBucket {
pub lower_bound: String,
pub upper_bound: String,
pub count: u64,
pub distinct_count: u64,
pub null_count: u64,
}
impl HistogramBucket {
pub fn contains(&self, value: &str) -> bool {
value >= self.lower_bound.as_str() && value <= self.upper_bound.as_str()
}
pub fn overlap_fraction(&self, lo: &str, hi: &str) -> f64 {
if self.lower_bound.is_empty() && self.upper_bound.is_empty() {
return 1.0;
}
let lb = self.lower_bound.as_str();
let ub = self.upper_bound.as_str();
if hi < lb || lo > ub {
return 0.0;
}
if lo <= lb && hi >= ub {
return 1.0;
}
let to_pos = |s: &str| -> u64 {
let bytes = s.as_bytes();
let mut v: u64 = 0;
for (i, &b) in bytes.iter().enumerate().take(8) {
v |= (b as u64) << (56 - i * 8);
}
v
};
let pos_lb = to_pos(lb);
let pos_ub = to_pos(ub);
let pos_lo = to_pos(lo).max(pos_lb);
let pos_hi = to_pos(hi).min(pos_ub);
if pos_ub == pos_lb {
return 1.0;
}
let overlap = pos_hi.saturating_sub(pos_lo) as f64;
let total = (pos_ub - pos_lb) as f64;
(overlap / total).clamp(0.0, 1.0)
}
}
#[derive(Debug, Clone)]
pub struct PredicateHistogram {
pub predicate_iri: String,
pub total_count: u64,
pub null_count: u64,
pub distinct_count: u64,
pub buckets: Vec<HistogramBucket>,
}
impl PredicateHistogram {
pub fn new(predicate: &str, num_buckets: usize) -> Self {
Self {
predicate_iri: predicate.to_owned(),
total_count: 0,
null_count: 0,
distinct_count: 0,
buckets: Vec::with_capacity(num_buckets),
}
}
pub fn build_from_values(predicate: &str, values: &[String], num_buckets: usize) -> Self {
let num_buckets = num_buckets.max(1);
let null_count = values.iter().filter(|v| v.is_empty()).count() as u64;
let mut sorted: Vec<&String> = values.iter().filter(|v| !v.is_empty()).collect();
sorted.sort_unstable();
let total_count = sorted.len() as u64;
let mut distinct_count = 0u64;
{
let mut prev: Option<&str> = None;
for v in &sorted {
if prev != Some(v.as_str()) {
distinct_count += 1;
prev = Some(v.as_str());
}
}
}
let mut buckets: Vec<HistogramBucket> = Vec::with_capacity(num_buckets);
if sorted.is_empty() {
return Self {
predicate_iri: predicate.to_owned(),
total_count,
null_count,
distinct_count,
buckets,
};
}
let target_depth = (sorted.len() as f64 / num_buckets as f64).ceil() as usize;
let target_depth = target_depth.max(1);
let mut cursor = 0usize;
while cursor < sorted.len() {
let end = (cursor + target_depth).min(sorted.len());
let slice = &sorted[cursor..end];
let lower_bound = slice[0].clone();
let upper_bound = slice[slice.len() - 1].clone();
let mut bucket_distinct = 0u64;
let mut prev: Option<&str> = None;
for v in slice {
if prev != Some(v.as_str()) {
bucket_distinct += 1;
prev = Some(v.as_str());
}
}
buckets.push(HistogramBucket {
lower_bound,
upper_bound,
count: slice.len() as u64,
distinct_count: bucket_distinct,
null_count: 0,
});
cursor = end;
}
Self {
predicate_iri: predicate.to_owned(),
total_count,
null_count,
distinct_count,
buckets,
}
}
pub fn estimate_selectivity_eq(&self, value: &str) -> f64 {
if self.total_count == 0 {
return 0.0;
}
for bucket in &self.buckets {
if bucket.contains(value) {
let distinct = bucket.distinct_count.max(1) as f64;
let bucket_sel = bucket.count as f64 / self.total_count as f64;
return bucket_sel / distinct;
}
}
let denominator =
(self.distinct_count.max(1) + 1) as f64 * (self.total_count.max(1) as f64);
1.0 / denominator
}
pub fn estimate_selectivity_range(&self, lo: &str, hi: &str) -> f64 {
if self.total_count == 0 {
return 0.0;
}
let mut covered_rows: f64 = 0.0;
for bucket in &self.buckets {
let frac = bucket.overlap_fraction(lo, hi);
if frac > 0.0 {
covered_rows += frac * bucket.count as f64;
}
}
(covered_rows / self.total_count as f64).clamp(0.0, 1.0)
}
pub fn estimate_cardinality(&self, total_triples: u64, predicate_filter: Option<&str>) -> u64 {
let base = if self.total_count > 0 {
self.total_count
} else {
total_triples
};
if base == 0 {
return 0;
}
match predicate_filter {
None => base,
Some(value) => {
let sel = self.estimate_selectivity_eq(value);
((base as f64 * sel).ceil() as u64).max(1)
}
}
}
pub fn update_incremental(&mut self, new_values: &[String]) {
for value in new_values {
if value.is_empty() {
self.null_count += 1;
continue;
}
self.total_count += 1;
let idx = self
.buckets
.iter()
.rposition(|b| b.lower_bound.as_str() <= value.as_str())
.unwrap_or(0);
if let Some(bucket) = self.buckets.get_mut(idx) {
bucket.count += 1;
if value.as_str() < bucket.lower_bound.as_str() {
bucket.lower_bound = value.clone();
}
if value.as_str() > bucket.upper_bound.as_str() {
bucket.upper_bound = value.clone();
bucket.distinct_count += 1;
self.distinct_count += 1;
}
} else if !self.buckets.is_empty() {
let last = self
.buckets
.last_mut()
.expect("buckets non-empty checked above");
last.count += 1;
if value.as_str() > last.upper_bound.as_str() {
last.upper_bound = value.clone();
last.distinct_count += 1;
self.distinct_count += 1;
}
} else {
self.buckets.push(HistogramBucket {
lower_bound: value.clone(),
upper_bound: value.clone(),
count: 1,
distinct_count: 1,
null_count: 0,
});
self.distinct_count += 1;
}
}
}
pub fn num_buckets(&self) -> usize {
self.buckets.len()
}
}
pub struct DatasetStatistics {
histograms: HashMap<String, PredicateHistogram>,
total_triples: u64,
distinct_subjects: u64,
distinct_predicates: u64,
distinct_objects: u64,
last_updated: Instant,
num_buckets: usize,
}
impl DatasetStatistics {
pub fn new() -> Self {
Self::with_num_buckets(100)
}
pub fn with_num_buckets(num_buckets: usize) -> Self {
Self {
histograms: HashMap::new(),
total_triples: 0,
distinct_subjects: 0,
distinct_predicates: 0,
distinct_objects: 0,
last_updated: Instant::now(),
num_buckets: num_buckets.max(1),
}
}
pub fn record_triple(&mut self, predicate: &str, object: &str) {
self.total_triples += 1;
let num_buckets = self.num_buckets;
let histogram = self
.histograms
.entry(predicate.to_owned())
.or_insert_with(|| PredicateHistogram::new(predicate, num_buckets));
histogram.update_incremental(&[object.to_owned()]);
self.distinct_predicates = self.histograms.len() as u64;
self.last_updated = Instant::now();
}
pub fn get_histogram(&self, predicate: &str) -> Option<&PredicateHistogram> {
self.histograms.get(predicate)
}
pub fn estimate_pattern_cardinality(
&self,
subject: Option<&str>,
predicate: Option<&str>,
object: Option<&str>,
) -> u64 {
if self.total_triples == 0 {
return 0;
}
let subject_selectivity = if subject.is_some() {
if self.distinct_subjects > 1 {
1.0 / self.distinct_subjects as f64
} else {
0.001
}
} else {
1.0
};
let base_cardinality: f64 = match (predicate, object) {
(None, None) => self.total_triples as f64,
(Some(pred), None) => {
self.histograms
.get(pred)
.map(|h| h.total_count as f64)
.unwrap_or_else(|| {
if self.distinct_predicates > 0 {
self.total_triples as f64 / self.distinct_predicates as f64
} else {
self.total_triples as f64 * 0.1
}
})
}
(Some(pred), Some(obj)) => {
match self.histograms.get(pred) {
Some(hist) => {
let sel = hist.estimate_selectivity_eq(obj);
(hist.total_count as f64 * sel).max(1.0)
}
None => 1.0, }
}
(None, Some(obj)) => {
if self.histograms.is_empty() {
return 0;
}
self.histograms
.values()
.map(|hist| {
let sel = hist.estimate_selectivity_eq(obj);
hist.total_count as f64 * sel
})
.sum::<f64>()
.max(1.0)
}
};
((base_cardinality * subject_selectivity).ceil() as u64).max(1)
}
pub fn rebuild_histograms(&mut self, values: &[(String, String)]) {
self.histograms.clear();
self.total_triples = values.len() as u64;
let mut by_predicate: HashMap<&str, Vec<&String>> = HashMap::new();
for (pred, obj) in values {
by_predicate.entry(pred.as_str()).or_default().push(obj);
}
let num_buckets = self.num_buckets;
for (pred, objs) in &by_predicate {
let owned_objs: Vec<String> = objs.iter().map(|s| (*s).clone()).collect();
let histogram = PredicateHistogram::build_from_values(pred, &owned_objs, num_buckets);
self.histograms.insert(pred.to_string(), histogram);
}
self.distinct_predicates = self.histograms.len() as u64;
self.distinct_objects = self.histograms.values().map(|h| h.distinct_count).sum();
self.last_updated = Instant::now();
}
pub fn total_triples(&self) -> u64 {
self.total_triples
}
pub fn set_distinct_subjects(&mut self, count: u64) {
self.distinct_subjects = count;
}
pub fn set_distinct_objects(&mut self, count: u64) {
self.distinct_objects = count;
}
pub fn last_updated(&self) -> Instant {
self.last_updated
}
pub fn predicate_count(&self) -> usize {
self.histograms.len()
}
}
impl Default for DatasetStatistics {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_bucket_contains() {
let bucket = HistogramBucket {
lower_bound: "apple".to_string(),
upper_bound: "mango".to_string(),
count: 10,
distinct_count: 5,
null_count: 0,
};
assert!(bucket.contains("apple"));
assert!(bucket.contains("cherry"));
assert!(bucket.contains("mango"));
assert!(!bucket.contains("aardvark")); assert!(!bucket.contains("zebra")); }
#[test]
fn test_bucket_overlap_fraction_full() {
let bucket = HistogramBucket {
lower_bound: "b".to_string(),
upper_bound: "m".to_string(),
count: 100,
distinct_count: 50,
null_count: 0,
};
let frac = bucket.overlap_fraction("a", "z");
assert!(frac > 0.99, "Expected ~1.0 but got {}", frac);
}
#[test]
fn test_bucket_overlap_fraction_none() {
let bucket = HistogramBucket {
lower_bound: "m".to_string(),
upper_bound: "z".to_string(),
count: 100,
distinct_count: 50,
null_count: 0,
};
let frac = bucket.overlap_fraction("a", "b");
assert_eq!(frac, 0.0);
}
#[test]
fn test_build_from_empty_values() {
let hist = PredicateHistogram::build_from_values("http://example.org/p", &[], 10);
assert_eq!(hist.total_count, 0);
assert_eq!(hist.buckets.len(), 0);
}
#[test]
fn test_build_from_values_single_bucket() {
let values: Vec<String> = (0..5).map(|i| format!("value{}", i)).collect();
let hist = PredicateHistogram::build_from_values("p", &values, 1);
assert_eq!(hist.total_count, 5);
assert_eq!(hist.buckets.len(), 1);
assert_eq!(hist.buckets[0].count, 5);
}
#[test]
fn test_build_from_values_multiple_buckets() {
let values: Vec<String> = (0..100).map(|i| format!("value{:03}", i)).collect();
let hist = PredicateHistogram::build_from_values("p", &values, 10);
assert_eq!(hist.total_count, 100);
assert_eq!(hist.distinct_count, 100);
assert!(hist.buckets.len() >= 9 && hist.buckets.len() <= 11);
let sum: u64 = hist.buckets.iter().map(|b| b.count).sum();
assert_eq!(sum, 100);
}
#[test]
fn test_build_handles_nulls() {
let mut values: Vec<String> = (0..10).map(|i| format!("v{}", i)).collect();
values.push(String::new()); values.push(String::new()); let hist = PredicateHistogram::build_from_values("p", &values, 5);
assert_eq!(hist.null_count, 2);
assert_eq!(hist.total_count, 10);
}
#[test]
fn test_estimate_selectivity_eq_within_bucket() {
let values: Vec<String> = vec![
"alpha".to_string(),
"beta".to_string(),
"gamma".to_string(),
"delta".to_string(),
"epsilon".to_string(),
];
let hist = PredicateHistogram::build_from_values("p", &values, 2);
let sel = hist.estimate_selectivity_eq("beta");
assert!(sel > 0.0 && sel <= 1.0, "Selectivity {} out of range", sel);
}
#[test]
fn test_estimate_selectivity_eq_missing_value() {
let values: Vec<String> = (0..10).map(|i| format!("value{}", i)).collect();
let hist = PredicateHistogram::build_from_values("p", &values, 5);
let sel = hist.estimate_selectivity_eq("zzz_not_in_dataset");
assert!(
sel < 0.01,
"Selectivity for missing value too high: {}",
sel
);
}
#[test]
fn test_estimate_selectivity_range() {
let values: Vec<String> = (0..100).map(|i| format!("{:03}", i)).collect();
let hist = PredicateHistogram::build_from_values("p", &values, 10);
let sel = hist.estimate_selectivity_range("000", "049");
assert!(
sel > 0.3 && sel < 0.7,
"Range selectivity {} unexpected",
sel
);
}
#[test]
fn test_estimate_cardinality_no_filter() {
let values: Vec<String> = (0..50).map(|i| format!("v{}", i)).collect();
let hist = PredicateHistogram::build_from_values("p", &values, 5);
let card = hist.estimate_cardinality(1000, None);
assert_eq!(card, 50);
}
#[test]
fn test_estimate_cardinality_with_filter() {
let values: Vec<String> = (0..100).map(|i| format!("{:03}", i)).collect();
let hist = PredicateHistogram::build_from_values("p", &values, 10);
let card = hist.estimate_cardinality(1000, Some("050"));
assert!(card >= 1);
}
#[test]
fn test_update_incremental() {
let initial: Vec<String> = (0..10).map(|i| format!("v{}", i)).collect();
let mut hist = PredicateHistogram::build_from_values("p", &initial, 3);
let before_count = hist.total_count;
let new_vals: Vec<String> = vec!["newA".to_string(), "newB".to_string()];
hist.update_incremental(&new_vals);
assert_eq!(hist.total_count, before_count + 2);
}
#[test]
fn test_update_incremental_null() {
let initial: Vec<String> = (0..5).map(|i| format!("v{}", i)).collect();
let mut hist = PredicateHistogram::build_from_values("p", &initial, 2);
hist.update_incremental(&[String::new()]);
assert_eq!(hist.null_count, 1);
}
#[test]
fn test_dataset_statistics_new() {
let stats = DatasetStatistics::new();
assert_eq!(stats.total_triples(), 0);
assert_eq!(stats.predicate_count(), 0);
}
#[test]
fn test_dataset_record_triple() {
let mut stats = DatasetStatistics::new();
stats.record_triple("http://example.org/age", "42");
stats.record_triple("http://example.org/age", "25");
stats.record_triple("http://example.org/name", "Alice");
assert_eq!(stats.total_triples(), 3);
assert_eq!(stats.predicate_count(), 2);
assert!(stats.get_histogram("http://example.org/age").is_some());
}
#[test]
fn test_dataset_estimate_pattern_cardinality_no_bindings() {
let mut stats = DatasetStatistics::new();
for i in 0..20 {
stats.record_triple("http://p", &format!("obj{}", i));
}
let card = stats.estimate_pattern_cardinality(None, None, None);
assert_eq!(card, 20);
}
#[test]
fn test_dataset_estimate_pattern_cardinality_known_predicate() {
let mut stats = DatasetStatistics::new();
for i in 0..30 {
stats.record_triple("http://p/name", &format!("name{}", i));
}
for i in 0..10 {
stats.record_triple("http://p/age", &format!("{}", i));
}
let card = stats.estimate_pattern_cardinality(None, Some("http://p/name"), None);
assert_eq!(card, 30);
}
#[test]
fn test_dataset_estimate_pattern_cardinality_eq_filter() {
let mut stats = DatasetStatistics::with_num_buckets(5);
for i in 0..100 {
stats.record_triple("http://p/color", &format!("color{:03}", i));
}
let card =
stats.estimate_pattern_cardinality(None, Some("http://p/color"), Some("color050"));
assert!(card >= 1, "Equality filter should return at least 1");
}
#[test]
fn test_dataset_rebuild_histograms() {
let mut stats = DatasetStatistics::new();
let pairs: Vec<(String, String)> = (0..200)
.map(|i| ("http://p".to_string(), format!("v{:04}", i)))
.collect();
stats.rebuild_histograms(&pairs);
assert_eq!(stats.total_triples(), 200);
assert_eq!(stats.predicate_count(), 1);
let hist = stats
.get_histogram("http://p")
.expect("histogram must exist");
assert_eq!(hist.total_count, 200);
}
#[test]
fn test_dataset_set_distinct_subjects() {
let mut stats = DatasetStatistics::new();
stats.record_triple("p", "o");
stats.set_distinct_subjects(500);
let with_subject = stats.estimate_pattern_cardinality(Some("s"), Some("p"), None);
let without_subject = stats.estimate_pattern_cardinality(None, Some("p"), None);
assert!(with_subject <= without_subject);
}
#[test]
fn test_dataset_unknown_predicate_estimate() {
let mut stats = DatasetStatistics::new();
for i in 0..90 {
stats.record_triple("http://p/known", &format!("v{}", i));
}
let card = stats.estimate_pattern_cardinality(None, Some("http://p/unknown"), None);
assert!(card >= 1);
}
}