use super::histogram::Histogram;
use grafeo_common::types::Value;
use std::collections::HashMap;
#[derive(Debug, Clone, Default)]
pub struct RdfStatistics {
pub total_triples: u64,
pub subject_count: u64,
pub predicate_count: u64,
pub object_count: u64,
pub predicates: HashMap<String, PredicateStatistics>,
pub subject_histogram: Option<Histogram>,
pub object_histogram: Option<Histogram>,
pub index_stats: IndexStatistics,
}
impl RdfStatistics {
pub fn new() -> Self {
Self::default()
}
pub fn update_from_counts(
&mut self,
total_triples: u64,
subject_count: u64,
predicate_count: u64,
object_count: u64,
) {
self.total_triples = total_triples;
self.subject_count = subject_count;
self.predicate_count = predicate_count;
self.object_count = object_count;
}
pub fn update_predicate(&mut self, predicate: &str, stats: PredicateStatistics) {
self.predicates.insert(predicate.to_string(), stats);
}
pub fn get_predicate(&self, predicate: &str) -> Option<&PredicateStatistics> {
self.predicates.get(predicate)
}
pub fn estimate_triple_pattern_cardinality(
&self,
subject_bound: bool,
predicate_bound: Option<&str>,
object_bound: bool,
) -> f64 {
if self.total_triples == 0 {
return 0.0;
}
let base = self.total_triples as f64;
match (subject_bound, predicate_bound, object_bound) {
(true, Some(_), true) => 1.0,
(true, Some(pred), false) => {
if let Some(stats) = self.predicates.get(pred) {
stats.avg_objects_per_subject()
} else {
10.0
}
}
(true, None, true) => {
self.predicate_count as f64
}
(true, None, false) => {
base / self.subject_count.max(1) as f64
}
(false, Some(pred), false) => {
if let Some(stats) = self.predicates.get(pred) {
stats.triple_count as f64
} else {
base / self.predicate_count.max(1) as f64
}
}
(false, Some(pred), true) => {
if let Some(stats) = self.predicates.get(pred) {
stats.avg_subjects_per_object()
} else {
10.0
}
}
(false, None, true) => {
base / self.object_count.max(1) as f64
}
(false, None, false) => base,
}
}
pub fn estimate_join_selectivity(
&self,
var_position1: TriplePosition,
var_position2: TriplePosition,
) -> f64 {
let domain_size = match (var_position1, var_position2) {
(TriplePosition::Subject, TriplePosition::Subject) => self.subject_count,
(TriplePosition::Subject, TriplePosition::Object)
| (TriplePosition::Object, TriplePosition::Subject) => {
self.subject_count.max(self.object_count)
}
(TriplePosition::Object, TriplePosition::Object) => self.object_count,
_ => {
self.predicate_count
}
};
if domain_size == 0 {
return 1.0;
}
1.0 / domain_size as f64
}
pub fn estimate_filter_selectivity(&self, predicate_iri: Option<&str>) -> f64 {
if let Some(pred) = predicate_iri
&& let Some(stats) = self.predicates.get(pred)
{
if let Some(ref _hist) = stats.object_histogram {
return 0.33;
}
}
0.33 }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TriplePosition {
Subject,
Predicate,
Object,
}
#[derive(Debug, Clone)]
pub struct PredicateStatistics {
pub triple_count: u64,
pub distinct_subjects: u64,
pub distinct_objects: u64,
pub is_functional: bool,
pub is_inverse_functional: bool,
pub object_type_distribution: HashMap<String, u64>,
pub object_histogram: Option<Histogram>,
}
impl PredicateStatistics {
pub fn new(triple_count: u64, distinct_subjects: u64, distinct_objects: u64) -> Self {
let is_functional = triple_count > 0 && triple_count == distinct_subjects;
let is_inverse_functional = triple_count > 0 && triple_count == distinct_objects;
Self {
triple_count,
distinct_subjects,
distinct_objects,
is_functional,
is_inverse_functional,
object_type_distribution: HashMap::new(),
object_histogram: None,
}
}
pub fn with_functional(mut self, functional: bool) -> Self {
self.is_functional = functional;
self
}
pub fn with_object_histogram(mut self, histogram: Histogram) -> Self {
self.object_histogram = Some(histogram);
self
}
pub fn with_object_types(mut self, types: HashMap<String, u64>) -> Self {
self.object_type_distribution = types;
self
}
pub fn avg_objects_per_subject(&self) -> f64 {
if self.distinct_subjects == 0 {
return 0.0;
}
self.triple_count as f64 / self.distinct_subjects as f64
}
pub fn avg_subjects_per_object(&self) -> f64 {
if self.distinct_objects == 0 {
return 0.0;
}
self.triple_count as f64 / self.distinct_objects as f64
}
pub fn object_equality_selectivity(&self, _value: &Value) -> f64 {
if let Some(ref hist) = self.object_histogram {
return hist.estimate_equality_selectivity(_value);
}
if self.distinct_objects == 0 {
return 1.0;
}
1.0 / self.distinct_objects as f64
}
}
#[derive(Debug, Clone, Default)]
pub struct IndexStatistics {
pub spo_lookup_cost: f64,
pub pos_lookup_cost: f64,
pub osp_lookup_cost: f64,
pub has_osp_index: bool,
}
impl IndexStatistics {
pub fn new() -> Self {
Self {
spo_lookup_cost: 1.0,
pos_lookup_cost: 1.5,
osp_lookup_cost: 2.0,
has_osp_index: true,
}
}
pub fn estimate_pattern_cost(
&self,
subject_bound: bool,
predicate_bound: bool,
object_bound: bool,
) -> f64 {
let bound_count =
u8::from(subject_bound) + u8::from(predicate_bound) + u8::from(object_bound);
match bound_count {
2..=3 => self.spo_lookup_cost,
1 => self.pos_lookup_cost,
_ => 10.0,
}
}
}
#[derive(Default)]
pub struct RdfStatisticsCollector {
triple_count: u64,
subjects: HashMap<String, u64>,
predicates: HashMap<String, PredicateCollector>,
objects: HashMap<String, u64>,
}
#[derive(Default)]
struct PredicateCollector {
count: u64,
subjects: HashMap<String, u64>,
objects: HashMap<String, u64>,
}
impl RdfStatisticsCollector {
pub fn new() -> Self {
Self::default()
}
pub fn record_triple(&mut self, subject: &str, predicate: &str, object: &str) {
self.triple_count += 1;
*self.subjects.entry(subject.to_string()).or_insert(0) += 1;
*self.objects.entry(object.to_string()).or_insert(0) += 1;
let pred_stats = self.predicates.entry(predicate.to_string()).or_default();
pred_stats.count += 1;
*pred_stats.subjects.entry(subject.to_string()).or_insert(0) += 1;
*pred_stats.objects.entry(object.to_string()).or_insert(0) += 1;
}
pub fn build(self) -> RdfStatistics {
let mut stats = RdfStatistics::new();
stats.total_triples = self.triple_count;
stats.subject_count = self.subjects.len() as u64;
stats.predicate_count = self.predicates.len() as u64;
stats.object_count = self.objects.len() as u64;
for (pred, collector) in self.predicates {
let pred_stats = PredicateStatistics::new(
collector.count,
collector.subjects.len() as u64,
collector.objects.len() as u64,
);
stats.predicates.insert(pred, pred_stats);
}
stats.index_stats = IndexStatistics::new();
stats
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_rdf_statistics_basic() {
let mut stats = RdfStatistics::new();
stats.update_from_counts(1000, 100, 50, 200);
let card = stats.estimate_triple_pattern_cardinality(true, None, false);
assert!(card > 0.0);
let full_card = stats.estimate_triple_pattern_cardinality(false, None, false);
assert_eq!(full_card, 1000.0);
}
#[test]
fn test_predicate_statistics() {
let pred_stats = PredicateStatistics::new(100, 50, 80);
assert_eq!(pred_stats.avg_objects_per_subject(), 2.0);
assert!(!pred_stats.is_functional);
}
#[test]
fn test_functional_predicate() {
let pred_stats = PredicateStatistics::new(100, 100, 100);
assert!(pred_stats.is_functional);
assert!(pred_stats.is_inverse_functional);
assert_eq!(pred_stats.avg_objects_per_subject(), 1.0);
}
#[test]
fn test_join_selectivity() {
let mut stats = RdfStatistics::new();
stats.update_from_counts(1000, 100, 50, 200);
let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Subject);
assert_eq!(sel, 0.01);
let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Object);
assert_eq!(sel, 1.0 / 200.0); }
#[test]
fn test_statistics_collector() {
let mut collector = RdfStatisticsCollector::new();
collector.record_triple("alix", "knows", "gus");
collector.record_triple("alix", "name", "Alix");
collector.record_triple("gus", "name", "Gus");
collector.record_triple("gus", "knows", "vincent");
let stats = collector.build();
assert_eq!(stats.total_triples, 4);
assert_eq!(stats.subject_count, 2); assert_eq!(stats.predicate_count, 2); assert_eq!(stats.object_count, 4); }
#[test]
fn test_pattern_cost_estimation() {
let index_stats = IndexStatistics::new();
let cost = index_stats.estimate_pattern_cost(true, true, false);
assert_eq!(cost, 1.0);
let cost = index_stats.estimate_pattern_cost(true, true, true);
assert_eq!(cost, 1.0);
let cost = index_stats.estimate_pattern_cost(true, false, false);
assert_eq!(cost, 1.5);
let cost = index_stats.estimate_pattern_cost(false, true, false);
assert_eq!(cost, 1.5);
let cost = index_stats.estimate_pattern_cost(false, false, false);
assert_eq!(cost, 10.0);
}
}