use std::collections::HashMap;
#[derive(Debug, Clone, Default)]
pub struct GraphStatistics {
total_triples: u64,
distinct_subjects: u64,
distinct_predicates: u64,
distinct_objects: u64,
predicate_frequency: HashMap<String, u64>,
subject_frequency: HashMap<String, u64>,
object_frequency: HashMap<String, u64>,
}
impl GraphStatistics {
pub fn new() -> Self {
Self::default()
}
pub fn add_triple(&mut self, subject: &str, predicate: &str, object: &str) {
self.total_triples += 1;
let s_prev = *self.subject_frequency.get(subject).unwrap_or(&0);
if s_prev == 0 {
self.distinct_subjects += 1;
}
*self
.subject_frequency
.entry(subject.to_string())
.or_insert(0) += 1;
let p_prev = *self.predicate_frequency.get(predicate).unwrap_or(&0);
if p_prev == 0 {
self.distinct_predicates += 1;
}
*self
.predicate_frequency
.entry(predicate.to_string())
.or_insert(0) += 1;
let o_prev = *self.object_frequency.get(object).unwrap_or(&0);
if o_prev == 0 {
self.distinct_objects += 1;
}
*self.object_frequency.entry(object.to_string()).or_insert(0) += 1;
}
pub fn remove_triple(&mut self, subject: &str, predicate: &str, object: &str) {
if self.total_triples == 0 {
return;
}
self.total_triples = self.total_triples.saturating_sub(1);
if let Some(cnt) = self.subject_frequency.get_mut(subject) {
*cnt = cnt.saturating_sub(1);
if *cnt == 0 {
self.subject_frequency.remove(subject);
self.distinct_subjects = self.distinct_subjects.saturating_sub(1);
}
}
if let Some(cnt) = self.predicate_frequency.get_mut(predicate) {
*cnt = cnt.saturating_sub(1);
if *cnt == 0 {
self.predicate_frequency.remove(predicate);
self.distinct_predicates = self.distinct_predicates.saturating_sub(1);
}
}
if let Some(cnt) = self.object_frequency.get_mut(object) {
*cnt = cnt.saturating_sub(1);
if *cnt == 0 {
self.object_frequency.remove(object);
self.distinct_objects = self.distinct_objects.saturating_sub(1);
}
}
}
pub fn total_triples(&self) -> u64 {
self.total_triples
}
pub fn distinct_subjects(&self) -> u64 {
self.distinct_subjects
}
pub fn distinct_predicates(&self) -> u64 {
self.distinct_predicates
}
pub fn distinct_objects(&self) -> u64 {
self.distinct_objects
}
pub fn predicate_frequency(&self, predicate: &str) -> u64 {
*self.predicate_frequency.get(predicate).unwrap_or(&0)
}
pub fn top_predicates(&self, limit: usize) -> Vec<(String, u64)> {
let mut v: Vec<_> = self
.predicate_frequency
.iter()
.map(|(k, &v)| (k.clone(), v))
.collect();
v.sort_by(|a, b| b.1.cmp(&a.1));
v.truncate(limit);
v
}
pub fn estimate_cardinality(
&self,
subject: Option<&str>,
predicate: Option<&str>,
object: Option<&str>,
) -> u64 {
if self.total_triples == 0 {
return 0;
}
let n = self.total_triples as f64;
let s_sel = match subject {
None => 1.0,
Some(s) => {
let freq = *self.subject_frequency.get(s).unwrap_or(&0) as f64;
if freq == 0.0 {
0.0
} else {
freq / n
}
}
};
let p_sel = match predicate {
None => 1.0,
Some(p) => {
let freq = *self.predicate_frequency.get(p).unwrap_or(&0) as f64;
if freq == 0.0 {
0.0
} else {
freq / n
}
}
};
let o_sel = match object {
None => 1.0,
Some(o) => {
let freq = *self.object_frequency.get(o).unwrap_or(&0) as f64;
if freq == 0.0 {
0.0
} else {
freq / n
}
}
};
(n * s_sel * p_sel * o_sel).ceil() as u64
}
pub fn merge(&mut self, other: &GraphStatistics) {
self.total_triples += other.total_triples;
for (s, &cnt) in &other.subject_frequency {
let prev = *self.subject_frequency.get(s).unwrap_or(&0);
if prev == 0 {
self.distinct_subjects += 1;
}
*self.subject_frequency.entry(s.clone()).or_insert(0) += cnt;
}
for (p, &cnt) in &other.predicate_frequency {
let prev = *self.predicate_frequency.get(p).unwrap_or(&0);
if prev == 0 {
self.distinct_predicates += 1;
}
*self.predicate_frequency.entry(p.clone()).or_insert(0) += cnt;
}
for (o, &cnt) in &other.object_frequency {
let prev = *self.object_frequency.get(o).unwrap_or(&0);
if prev == 0 {
self.distinct_objects += 1;
}
*self.object_frequency.entry(o.clone()).or_insert(0) += cnt;
}
}
}
#[derive(Debug, Clone, Default)]
pub struct PredicateHistogram {
frequencies: HashMap<String, u64>,
total: u64,
}
impl PredicateHistogram {
pub fn new() -> Self {
Self::default()
}
pub fn record(&mut self, predicate: &str) {
*self.frequencies.entry(predicate.to_string()).or_insert(0) += 1;
self.total += 1;
}
pub fn remove(&mut self, predicate: &str) {
if let Some(cnt) = self.frequencies.get_mut(predicate) {
if *cnt > 0 {
*cnt -= 1;
self.total = self.total.saturating_sub(1);
if *cnt == 0 {
self.frequencies.remove(predicate);
}
}
}
}
pub fn frequency(&self, predicate: &str) -> u64 {
*self.frequencies.get(predicate).unwrap_or(&0)
}
pub fn relative_frequency(&self, predicate: &str) -> f64 {
if self.total == 0 {
0.0
} else {
*self.frequencies.get(predicate).unwrap_or(&0) as f64 / self.total as f64
}
}
pub fn total(&self) -> u64 {
self.total
}
pub fn distinct_count(&self) -> usize {
self.frequencies.len()
}
pub fn top_n(&self, n: usize) -> Vec<(String, u64)> {
let mut v: Vec<_> = self
.frequencies
.iter()
.map(|(k, &v)| (k.clone(), v))
.collect();
v.sort_by(|a, b| b.1.cmp(&a.1));
v.truncate(n);
v
}
pub fn bottom_n(&self, n: usize) -> Vec<(String, u64)> {
let mut v: Vec<_> = self
.frequencies
.iter()
.map(|(k, &v)| (k.clone(), v))
.collect();
v.sort_by(|a, b| a.1.cmp(&b.1));
v.truncate(n);
v
}
pub fn all_sorted(&self) -> Vec<(String, u64)> {
let mut v: Vec<_> = self
.frequencies
.iter()
.map(|(k, &v)| (k.clone(), v))
.collect();
v.sort_by(|a, b| a.0.cmp(&b.0));
v
}
}
#[derive(Debug, Clone, Default)]
pub struct CardinalityEstimator {
total_triples: u64,
distinct_subjects: u64,
distinct_objects: u64,
predicate_freq: HashMap<String, u64>,
}
impl CardinalityEstimator {
pub fn new() -> Self {
Self::default()
}
pub fn from_graph_stats(stats: &GraphStatistics) -> Self {
Self {
total_triples: stats.total_triples(),
distinct_subjects: stats.distinct_subjects(),
distinct_objects: stats.distinct_objects(),
predicate_freq: stats.predicate_frequency.clone(),
}
}
pub fn estimate_pattern(
&self,
subject_bound: bool,
predicate: Option<&str>,
object_bound: bool,
) -> u64 {
if self.total_triples == 0 {
return 0;
}
let n = self.total_triples as f64;
let mut sel = 1.0_f64;
if subject_bound {
let ds = self.distinct_subjects.max(1) as f64;
sel *= 1.0 / ds;
}
if let Some(p) = predicate {
let freq = *self.predicate_freq.get(p).unwrap_or(&0) as f64;
if freq == 0.0 {
return 1; }
sel *= freq / n;
}
if object_bound {
let dobj = self.distinct_objects.max(1) as f64;
sel *= 1.0 / dobj;
}
(n * sel).ceil() as u64
}
pub fn estimate_join(&self, left_card: u64, right_card: u64, shared_vars: usize) -> u64 {
if self.total_triples == 0 || left_card == 0 || right_card == 0 {
return 0;
}
let n = self.total_triples.max(1) as f64;
let reduction = n.powi(shared_vars as i32);
((left_card as f64 * right_card as f64 / reduction).ceil() as u64).max(1)
}
pub fn update_from(&mut self, stats: &GraphStatistics) {
self.total_triples = stats.total_triples();
self.distinct_subjects = stats.distinct_subjects();
self.distinct_objects = stats.distinct_objects();
self.predicate_freq = stats.predicate_frequency.clone();
}
}
#[derive(Debug, Clone)]
pub struct SampledStatistics {
sample_size: usize,
sample: Vec<(String, String, String)>,
total_seen: u64,
graph_stats: GraphStatistics,
}
impl SampledStatistics {
pub fn new(sample_size: usize) -> Self {
Self {
sample_size,
sample: Vec::with_capacity(sample_size),
total_seen: 0,
graph_stats: GraphStatistics::new(),
}
}
pub fn observe(&mut self, subject: &str, predicate: &str, object: &str) {
self.total_seen += 1;
self.graph_stats.add_triple(subject, predicate, object);
if self.sample.len() < self.sample_size {
self.sample.push((
subject.to_string(),
predicate.to_string(),
object.to_string(),
));
} else {
let hash = Self::hash3(subject, predicate, object, self.total_seen);
let idx = (hash % self.total_seen) as usize;
if idx < self.sample_size {
self.sample[idx] = (
subject.to_string(),
predicate.to_string(),
object.to_string(),
);
}
}
}
fn hash3(s: &str, p: &str, o: &str, n: u64) -> u64 {
const FNV_OFFSET: u64 = 14_695_981_039_346_656_037;
const FNV_PRIME: u64 = 1_099_511_628_211;
let mut h = FNV_OFFSET;
for b in s.bytes().chain(p.bytes()).chain(o.bytes()) {
h ^= b as u64;
h = h.wrapping_mul(FNV_PRIME);
}
h ^= n;
h = h.wrapping_mul(FNV_PRIME);
h
}
pub fn total_seen(&self) -> u64 {
self.total_seen
}
pub fn sample_size(&self) -> usize {
self.sample.len()
}
pub fn sample(&self) -> &[(String, String, String)] {
&self.sample
}
pub fn estimated_total(&self) -> u64 {
self.total_seen
}
pub fn estimated_distinct_predicates(&self) -> u64 {
let sample_distinct = self.graph_stats.distinct_predicates();
if self.sample.is_empty() || self.total_seen == 0 {
return sample_distinct;
}
let ratio = self.total_seen as f64 / self.sample.len() as f64;
(sample_distinct as f64 * ratio.sqrt()).ceil() as u64
}
pub fn estimate_cardinality(
&self,
subject: Option<&str>,
predicate: Option<&str>,
object: Option<&str>,
) -> u64 {
self.graph_stats
.estimate_cardinality(subject, predicate, object)
}
pub fn graph_stats(&self) -> &GraphStatistics {
&self.graph_stats
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_graph_stats_empty() {
let gs = GraphStatistics::new();
assert_eq!(gs.total_triples(), 0);
assert_eq!(gs.distinct_subjects(), 0);
assert_eq!(gs.distinct_predicates(), 0);
assert_eq!(gs.distinct_objects(), 0);
}
#[test]
fn test_graph_stats_add_single_triple() {
let mut gs = GraphStatistics::new();
gs.add_triple("alice", "knows", "bob");
assert_eq!(gs.total_triples(), 1);
assert_eq!(gs.distinct_subjects(), 1);
assert_eq!(gs.distinct_predicates(), 1);
assert_eq!(gs.distinct_objects(), 1);
}
#[test]
fn test_graph_stats_add_multiple_triples_same_predicate() {
let mut gs = GraphStatistics::new();
gs.add_triple("alice", "knows", "bob");
gs.add_triple("alice", "knows", "carol");
gs.add_triple("bob", "knows", "carol");
assert_eq!(gs.total_triples(), 3);
assert_eq!(gs.distinct_predicates(), 1);
assert_eq!(gs.predicate_frequency("knows"), 3);
}
#[test]
fn test_graph_stats_distinct_counts_correct() {
let mut gs = GraphStatistics::new();
gs.add_triple("s1", "p1", "o1");
gs.add_triple("s1", "p2", "o2");
gs.add_triple("s2", "p1", "o1");
assert_eq!(gs.distinct_subjects(), 2);
assert_eq!(gs.distinct_predicates(), 2);
assert_eq!(gs.distinct_objects(), 2);
}
#[test]
fn test_graph_stats_remove_triple() {
let mut gs = GraphStatistics::new();
gs.add_triple("s", "p", "o");
gs.remove_triple("s", "p", "o");
assert_eq!(gs.total_triples(), 0);
assert_eq!(gs.distinct_subjects(), 0);
assert_eq!(gs.distinct_predicates(), 0);
assert_eq!(gs.distinct_objects(), 0);
}
#[test]
fn test_graph_stats_remove_partial() {
let mut gs = GraphStatistics::new();
gs.add_triple("s", "p", "o1");
gs.add_triple("s", "p", "o2");
gs.remove_triple("s", "p", "o1");
assert_eq!(gs.total_triples(), 1);
assert_eq!(gs.distinct_subjects(), 1); assert_eq!(gs.distinct_objects(), 1); }
#[test]
fn test_graph_stats_remove_nonexistent_is_noop() {
let mut gs = GraphStatistics::new();
gs.remove_triple("nonexistent", "p", "o"); assert_eq!(gs.total_triples(), 0);
}
#[test]
fn test_graph_stats_estimate_cardinality_wildcard() {
let mut gs = GraphStatistics::new();
for i in 0..100 {
gs.add_triple(&format!("s{i}"), "p", "o");
}
let card = gs.estimate_cardinality(None, None, None);
assert_eq!(card, 100);
}
#[test]
fn test_graph_stats_estimate_cardinality_bound_predicate() {
let mut gs = GraphStatistics::new();
for i in 0..100 {
gs.add_triple(&format!("s{i}"), "p1", "o");
gs.add_triple(&format!("s{i}"), "p2", "o");
}
let card = gs.estimate_cardinality(None, Some("p1"), None);
assert!(card <= 100);
assert!(card >= 1);
}
#[test]
fn test_graph_stats_estimate_cardinality_unknown_predicate() {
let mut gs = GraphStatistics::new();
gs.add_triple("s", "p", "o");
let card = gs.estimate_cardinality(None, Some("nonexistent"), None);
assert_eq!(card, 0);
}
#[test]
fn test_graph_stats_top_predicates() {
let mut gs = GraphStatistics::new();
for _ in 0..10 {
gs.add_triple("s", "popular", "o");
}
for _ in 0..2 {
gs.add_triple("s", "rare", "o");
}
let top = gs.top_predicates(1);
assert_eq!(top[0].0, "popular");
assert_eq!(top[0].1, 10);
}
#[test]
fn test_graph_stats_merge() {
let mut gs1 = GraphStatistics::new();
gs1.add_triple("s1", "p", "o1");
let mut gs2 = GraphStatistics::new();
gs2.add_triple("s2", "p", "o2");
gs1.merge(&gs2);
assert_eq!(gs1.total_triples(), 2);
assert_eq!(gs1.distinct_subjects(), 2);
}
#[test]
fn test_predicate_histogram_empty() {
let ph = PredicateHistogram::new();
assert_eq!(ph.total(), 0);
assert_eq!(ph.distinct_count(), 0);
assert_eq!(ph.frequency("any"), 0);
}
#[test]
fn test_predicate_histogram_record() {
let mut ph = PredicateHistogram::new();
ph.record("p1");
ph.record("p1");
ph.record("p2");
assert_eq!(ph.total(), 3);
assert_eq!(ph.frequency("p1"), 2);
assert_eq!(ph.frequency("p2"), 1);
assert_eq!(ph.distinct_count(), 2);
}
#[test]
fn test_predicate_histogram_relative_frequency() {
let mut ph = PredicateHistogram::new();
for _ in 0..3 {
ph.record("a");
}
for _ in 0..7 {
ph.record("b");
}
let rf_a = ph.relative_frequency("a");
let rf_b = ph.relative_frequency("b");
assert!((rf_a - 0.3).abs() < 1e-9);
assert!((rf_b - 0.7).abs() < 1e-9);
}
#[test]
fn test_predicate_histogram_remove() {
let mut ph = PredicateHistogram::new();
ph.record("p");
ph.record("p");
ph.remove("p");
assert_eq!(ph.frequency("p"), 1);
ph.remove("p");
assert_eq!(ph.frequency("p"), 0);
assert_eq!(ph.distinct_count(), 0);
}
#[test]
fn test_predicate_histogram_top_n() {
let mut ph = PredicateHistogram::new();
for i in 0..5 {
for _ in 0..(i + 1) {
ph.record(&format!("p{i}"));
}
}
let top = ph.top_n(2);
assert_eq!(top.len(), 2);
assert!(top[0].1 >= top[1].1);
}
#[test]
fn test_predicate_histogram_bottom_n() {
let mut ph = PredicateHistogram::new();
for _ in 0..10 {
ph.record("common");
}
for _ in 0..1 {
ph.record("rare");
}
let bottom = ph.bottom_n(1);
assert_eq!(bottom[0].0, "rare");
}
#[test]
fn test_predicate_histogram_all_sorted() {
let mut ph = PredicateHistogram::new();
ph.record("zebra");
ph.record("apple");
ph.record("mango");
let sorted = ph.all_sorted();
assert_eq!(sorted[0].0, "apple");
assert_eq!(sorted[1].0, "mango");
assert_eq!(sorted[2].0, "zebra");
}
#[test]
fn test_cardinality_estimator_empty() {
let ce = CardinalityEstimator::new();
assert_eq!(ce.estimate_pattern(false, None, false), 0);
}
#[test]
fn test_cardinality_estimator_from_graph_stats() {
let mut gs = GraphStatistics::new();
for i in 0..50 {
gs.add_triple(&format!("s{i}"), "knows", "o");
}
let ce = CardinalityEstimator::from_graph_stats(&gs);
let card = ce.estimate_pattern(false, Some("knows"), false);
assert_eq!(card, 50);
}
#[test]
fn test_cardinality_estimator_bound_subject_reduces_card() {
let mut gs = GraphStatistics::new();
for i in 0..100 {
gs.add_triple(&format!("s{i}"), "p", "o");
}
let ce = CardinalityEstimator::from_graph_stats(&gs);
let card_with = ce.estimate_pattern(true, None, false);
let card_without = ce.estimate_pattern(false, None, false);
assert!(card_with <= card_without);
}
#[test]
fn test_cardinality_estimator_unknown_predicate() {
let mut gs = GraphStatistics::new();
gs.add_triple("s", "known_p", "o");
let ce = CardinalityEstimator::from_graph_stats(&gs);
let card = ce.estimate_pattern(false, Some("unknown_p"), false);
assert_eq!(card, 1); }
#[test]
fn test_cardinality_estimator_estimate_join() {
let mut gs = GraphStatistics::new();
for i in 0..100 {
gs.add_triple(&format!("s{i}"), "p", "o");
}
let ce = CardinalityEstimator::from_graph_stats(&gs);
let join_card = ce.estimate_join(50, 50, 1);
assert_eq!(join_card, 25);
}
#[test]
fn test_cardinality_estimator_join_zero() {
let ce = CardinalityEstimator::new();
assert_eq!(ce.estimate_join(10, 10, 0), 0);
}
#[test]
fn test_cardinality_estimator_update_from() {
let ce_before = CardinalityEstimator::new();
assert_eq!(ce_before.estimate_pattern(false, None, false), 0);
let mut gs = GraphStatistics::new();
gs.add_triple("s", "p", "o");
let mut ce = CardinalityEstimator::new();
ce.update_from(&gs);
assert_eq!(ce.estimate_pattern(false, None, false), 1);
}
#[test]
fn test_sampled_statistics_empty() {
let ss = SampledStatistics::new(100);
assert_eq!(ss.total_seen(), 0);
assert_eq!(ss.sample_size(), 0);
}
#[test]
fn test_sampled_statistics_observe_fewer_than_capacity() {
let mut ss = SampledStatistics::new(100);
ss.observe("s", "p", "o");
ss.observe("s2", "p2", "o2");
assert_eq!(ss.total_seen(), 2);
assert_eq!(ss.sample_size(), 2);
}
#[test]
fn test_sampled_statistics_observe_more_than_capacity() {
let mut ss = SampledStatistics::new(10);
for i in 0..50 {
ss.observe(&format!("s{i}"), "p", &format!("o{i}"));
}
assert_eq!(ss.total_seen(), 50);
assert_eq!(ss.sample_size(), 10);
}
#[test]
fn test_sampled_statistics_estimated_total() {
let mut ss = SampledStatistics::new(100);
for i in 0..200 {
ss.observe(&format!("s{i}"), "p", "o");
}
assert_eq!(ss.estimated_total(), 200);
}
#[test]
fn test_sampled_statistics_cardinality_from_sample() {
let mut ss = SampledStatistics::new(1000);
for i in 0..100 {
ss.observe(&format!("s{i}"), "p", "o");
}
let card = ss.estimate_cardinality(None, Some("p"), None);
assert!(card > 0);
}
#[test]
fn test_sampled_statistics_estimated_distinct_predicates() {
let mut ss = SampledStatistics::new(50);
for i in 0..5 {
ss.observe("s", &format!("p{i}"), "o");
}
let est = ss.estimated_distinct_predicates();
assert!(est >= 5);
}
#[test]
fn test_sampled_statistics_graph_stats_available() {
let mut ss = SampledStatistics::new(100);
ss.observe("alice", "knows", "bob");
let gs = ss.graph_stats();
assert_eq!(gs.total_triples(), 1);
}
}