Skip to main content

grafeo_core/statistics/
rdf.rs

1//! RDF-specific statistics for SPARQL query optimization.
2//!
3//! This module provides statistics structures specific to RDF triple stores,
4//! including predicate selectivity, triple pattern cardinality estimation,
5//! and join selectivity for SPARQL queries.
6
7use super::histogram::Histogram;
8use grafeo_common::types::Value;
9use std::collections::HashMap;
10
11/// Statistics for an RDF triple store.
12#[derive(Debug, Clone, Default)]
13pub struct RdfStatistics {
14    /// Total number of triples.
15    pub total_triples: u64,
16    /// Number of unique subjects.
17    pub subject_count: u64,
18    /// Number of unique predicates.
19    pub predicate_count: u64,
20    /// Number of unique objects.
21    pub object_count: u64,
22
23    /// Per-predicate statistics.
24    pub predicates: HashMap<String, PredicateStatistics>,
25
26    /// Subject frequency histogram (for join selectivity).
27    pub subject_histogram: Option<Histogram>,
28    /// Object frequency histogram.
29    pub object_histogram: Option<Histogram>,
30
31    /// Index access pattern statistics (for cost model).
32    pub index_stats: IndexStatistics,
33}
34
35impl RdfStatistics {
36    /// Creates new empty RDF statistics.
37    pub fn new() -> Self {
38        Self::default()
39    }
40
41    /// Updates statistics from an RDF store.
42    pub fn update_from_counts(
43        &mut self,
44        total_triples: u64,
45        subject_count: u64,
46        predicate_count: u64,
47        object_count: u64,
48    ) {
49        self.total_triples = total_triples;
50        self.subject_count = subject_count;
51        self.predicate_count = predicate_count;
52        self.object_count = object_count;
53    }
54
55    /// Adds or updates predicate statistics.
56    pub fn update_predicate(&mut self, predicate: &str, stats: PredicateStatistics) {
57        self.predicates.insert(predicate.to_string(), stats);
58    }
59
60    /// Gets predicate statistics.
61    pub fn get_predicate(&self, predicate: &str) -> Option<&PredicateStatistics> {
62        self.predicates.get(predicate)
63    }
64
65    /// Estimates cardinality for a triple pattern.
66    ///
67    /// # Arguments
68    /// * `subject_bound` - Whether the subject is a constant
69    /// * `predicate_bound` - Whether the predicate is a constant (and its value if so)
70    /// * `object_bound` - Whether the object is a constant
71    pub fn estimate_triple_pattern_cardinality(
72        &self,
73        subject_bound: bool,
74        predicate_bound: Option<&str>,
75        object_bound: bool,
76    ) -> f64 {
77        if self.total_triples == 0 {
78            return 0.0;
79        }
80
81        let base = self.total_triples as f64;
82
83        match (subject_bound, predicate_bound, object_bound) {
84            // Fully bound pattern - either 0 or 1
85            (true, Some(_), true) => 1.0,
86
87            // Subject and predicate bound
88            (true, Some(pred), false) => {
89                if let Some(stats) = self.predicates.get(pred) {
90                    // Average objects per subject for this predicate
91                    stats.avg_objects_per_subject()
92                } else {
93                    // Default: assume 10 objects per subject
94                    10.0
95                }
96            }
97
98            // Subject and object bound (any predicate)
99            (true, None, true) => {
100                // Relatively rare - use predicate count as estimate
101                self.predicate_count as f64
102            }
103
104            // Only subject bound
105            (true, None, false) => {
106                // Average triples per subject
107                base / self.subject_count.max(1) as f64
108            }
109
110            // Only predicate bound
111            (false, Some(pred), false) => {
112                if let Some(stats) = self.predicates.get(pred) {
113                    stats.triple_count as f64
114                } else {
115                    base / self.predicate_count.max(1) as f64
116                }
117            }
118
119            // Predicate and object bound
120            (false, Some(pred), true) => {
121                if let Some(stats) = self.predicates.get(pred) {
122                    // Average subjects per object for this predicate
123                    stats.avg_subjects_per_object()
124                } else {
125                    10.0
126                }
127            }
128
129            // Only object bound
130            (false, None, true) => {
131                // Average triples per object
132                base / self.object_count.max(1) as f64
133            }
134
135            // No bindings - full scan
136            (false, None, false) => base,
137        }
138    }
139
140    /// Estimates join selectivity between two patterns sharing a variable.
141    pub fn estimate_join_selectivity(
142        &self,
143        var_position1: TriplePosition,
144        var_position2: TriplePosition,
145    ) -> f64 {
146        let domain_size = match (var_position1, var_position2) {
147            (TriplePosition::Subject, TriplePosition::Subject) => self.subject_count,
148            (TriplePosition::Subject, TriplePosition::Object)
149            | (TriplePosition::Object, TriplePosition::Subject) => {
150                // Subject-object join - use larger domain
151                self.subject_count.max(self.object_count)
152            }
153            (TriplePosition::Object, TriplePosition::Object) => self.object_count,
154            _ => {
155                // Joins involving predicates are rare and highly selective
156                self.predicate_count
157            }
158        };
159
160        if domain_size == 0 {
161            return 1.0;
162        }
163
164        // Join selectivity ≈ 1 / domain_size
165        1.0 / domain_size as f64
166    }
167
168    /// Estimates the cardinality after a FILTER operation.
169    pub fn estimate_filter_selectivity(&self, predicate_iri: Option<&str>) -> f64 {
170        // Default filter selectivity
171        if let Some(pred) = predicate_iri {
172            if let Some(stats) = self.predicates.get(pred) {
173                // Use predicate's object statistics for filter estimation
174                if let Some(ref _hist) = stats.object_histogram {
175                    // Assume filters reduce to ~33% of values
176                    return 0.33;
177                }
178            }
179        }
180        0.33 // Default filter selectivity
181    }
182}
183
184/// Position in a triple pattern.
185#[derive(Debug, Clone, Copy, PartialEq, Eq)]
186pub enum TriplePosition {
187    /// Subject position.
188    Subject,
189    /// Predicate position.
190    Predicate,
191    /// Object position.
192    Object,
193}
194
195/// Statistics for a specific predicate (property).
196#[derive(Debug, Clone)]
197pub struct PredicateStatistics {
198    /// Number of triples with this predicate.
199    pub triple_count: u64,
200    /// Number of unique subjects using this predicate.
201    pub distinct_subjects: u64,
202    /// Number of unique objects for this predicate.
203    pub distinct_objects: u64,
204    /// Whether this predicate is functional (1 object per subject).
205    pub is_functional: bool,
206    /// Whether this predicate is inverse functional (1 subject per object).
207    pub is_inverse_functional: bool,
208    /// Object type statistics (for typed literals).
209    pub object_type_distribution: HashMap<String, u64>,
210    /// Histogram of object values (for selective filters).
211    pub object_histogram: Option<Histogram>,
212}
213
214impl PredicateStatistics {
215    /// Creates new predicate statistics.
216    pub fn new(triple_count: u64, distinct_subjects: u64, distinct_objects: u64) -> Self {
217        let is_functional = triple_count > 0 && triple_count == distinct_subjects;
218        let is_inverse_functional = triple_count > 0 && triple_count == distinct_objects;
219
220        Self {
221            triple_count,
222            distinct_subjects,
223            distinct_objects,
224            is_functional,
225            is_inverse_functional,
226            object_type_distribution: HashMap::new(),
227            object_histogram: None,
228        }
229    }
230
231    /// Marks the predicate as functional.
232    pub fn with_functional(mut self, functional: bool) -> Self {
233        self.is_functional = functional;
234        self
235    }
236
237    /// Adds object histogram.
238    pub fn with_object_histogram(mut self, histogram: Histogram) -> Self {
239        self.object_histogram = Some(histogram);
240        self
241    }
242
243    /// Adds object type distribution.
244    pub fn with_object_types(mut self, types: HashMap<String, u64>) -> Self {
245        self.object_type_distribution = types;
246        self
247    }
248
249    /// Average number of objects per subject.
250    pub fn avg_objects_per_subject(&self) -> f64 {
251        if self.distinct_subjects == 0 {
252            return 0.0;
253        }
254        self.triple_count as f64 / self.distinct_subjects as f64
255    }
256
257    /// Average number of subjects per object.
258    pub fn avg_subjects_per_object(&self) -> f64 {
259        if self.distinct_objects == 0 {
260            return 0.0;
261        }
262        self.triple_count as f64 / self.distinct_objects as f64
263    }
264
265    /// Selectivity of a value equality filter on objects.
266    pub fn object_equality_selectivity(&self, _value: &Value) -> f64 {
267        if let Some(ref hist) = self.object_histogram {
268            // Use histogram for better estimate
269            return hist.estimate_equality_selectivity(_value);
270        }
271
272        // Fall back to uniform distribution
273        if self.distinct_objects == 0 {
274            return 1.0;
275        }
276        1.0 / self.distinct_objects as f64
277    }
278}
279
280/// Statistics about index access patterns for cost estimation.
281#[derive(Debug, Clone, Default)]
282pub struct IndexStatistics {
283    /// Average cost of SPO index lookup (subject first).
284    pub spo_lookup_cost: f64,
285    /// Average cost of POS index lookup (predicate first).
286    pub pos_lookup_cost: f64,
287    /// Average cost of OSP index lookup (object first).
288    pub osp_lookup_cost: f64,
289    /// Whether OSP index is available.
290    pub has_osp_index: bool,
291}
292
293impl IndexStatistics {
294    /// Creates default index statistics.
295    pub fn new() -> Self {
296        Self {
297            spo_lookup_cost: 1.0,
298            pos_lookup_cost: 1.5,
299            osp_lookup_cost: 2.0,
300            has_osp_index: true,
301        }
302    }
303
304    /// Estimates the cost of executing a triple pattern.
305    pub fn estimate_pattern_cost(
306        &self,
307        subject_bound: bool,
308        predicate_bound: bool,
309        object_bound: bool,
310    ) -> f64 {
311        match (subject_bound, predicate_bound, object_bound) {
312            // Use SPO index
313            (true, _, _) => self.spo_lookup_cost,
314            // Use POS index
315            (false, true, _) => self.pos_lookup_cost,
316            // Use OSP index if available
317            (false, false, true) if self.has_osp_index => self.osp_lookup_cost,
318            // Full scan
319            _ => 10.0, // High cost for full scan
320        }
321    }
322}
323
324/// Collector for building RDF statistics from a triple store.
325#[derive(Default)]
326pub struct RdfStatisticsCollector {
327    /// Total triple count.
328    triple_count: u64,
329    /// Subject occurrences.
330    subjects: HashMap<String, u64>,
331    /// Predicate occurrences.
332    predicates: HashMap<String, PredicateCollector>,
333    /// Object occurrences.
334    objects: HashMap<String, u64>,
335}
336
337/// Collector for per-predicate statistics.
338#[derive(Default)]
339struct PredicateCollector {
340    count: u64,
341    subjects: HashMap<String, u64>,
342    objects: HashMap<String, u64>,
343}
344
345impl RdfStatisticsCollector {
346    /// Creates a new statistics collector.
347    pub fn new() -> Self {
348        Self::default()
349    }
350
351    /// Records a triple.
352    pub fn record_triple(&mut self, subject: &str, predicate: &str, object: &str) {
353        self.triple_count += 1;
354
355        *self.subjects.entry(subject.to_string()).or_insert(0) += 1;
356        *self.objects.entry(object.to_string()).or_insert(0) += 1;
357
358        let pred_stats = self.predicates.entry(predicate.to_string()).or_default();
359        pred_stats.count += 1;
360        *pred_stats.subjects.entry(subject.to_string()).or_insert(0) += 1;
361        *pred_stats.objects.entry(object.to_string()).or_insert(0) += 1;
362    }
363
364    /// Builds the final statistics.
365    pub fn build(self) -> RdfStatistics {
366        let mut stats = RdfStatistics::new();
367
368        stats.total_triples = self.triple_count;
369        stats.subject_count = self.subjects.len() as u64;
370        stats.predicate_count = self.predicates.len() as u64;
371        stats.object_count = self.objects.len() as u64;
372
373        for (pred, collector) in self.predicates {
374            let pred_stats = PredicateStatistics::new(
375                collector.count,
376                collector.subjects.len() as u64,
377                collector.objects.len() as u64,
378            );
379            stats.predicates.insert(pred, pred_stats);
380        }
381
382        stats.index_stats = IndexStatistics::new();
383
384        stats
385    }
386}
387
388#[cfg(test)]
389mod tests {
390    use super::*;
391
392    #[test]
393    fn test_rdf_statistics_basic() {
394        let mut stats = RdfStatistics::new();
395        stats.update_from_counts(1000, 100, 50, 200);
396
397        // Test triple pattern cardinality estimation
398        let card = stats.estimate_triple_pattern_cardinality(true, None, false);
399        assert!(card > 0.0);
400
401        // Fully unbound should return total
402        let full_card = stats.estimate_triple_pattern_cardinality(false, None, false);
403        assert_eq!(full_card, 1000.0);
404    }
405
406    #[test]
407    fn test_predicate_statistics() {
408        let pred_stats = PredicateStatistics::new(100, 50, 80);
409
410        assert_eq!(pred_stats.avg_objects_per_subject(), 2.0);
411        assert!(!pred_stats.is_functional);
412    }
413
414    #[test]
415    fn test_functional_predicate() {
416        let pred_stats = PredicateStatistics::new(100, 100, 100);
417
418        assert!(pred_stats.is_functional);
419        assert!(pred_stats.is_inverse_functional);
420        assert_eq!(pred_stats.avg_objects_per_subject(), 1.0);
421    }
422
423    #[test]
424    fn test_join_selectivity() {
425        let mut stats = RdfStatistics::new();
426        stats.update_from_counts(1000, 100, 50, 200);
427
428        let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Subject);
429        assert_eq!(sel, 0.01); // 1/100
430
431        let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Object);
432        assert_eq!(sel, 1.0 / 200.0); // 1/max(100, 200)
433    }
434
435    #[test]
436    fn test_statistics_collector() {
437        let mut collector = RdfStatisticsCollector::new();
438
439        collector.record_triple("alice", "knows", "bob");
440        collector.record_triple("alice", "name", "Alice");
441        collector.record_triple("bob", "name", "Bob");
442        collector.record_triple("bob", "knows", "charlie");
443
444        let stats = collector.build();
445
446        assert_eq!(stats.total_triples, 4);
447        assert_eq!(stats.subject_count, 2); // alice, bob
448        assert_eq!(stats.predicate_count, 2); // knows, name
449        assert_eq!(stats.object_count, 4); // bob, Alice, Bob, charlie
450    }
451
452    #[test]
453    fn test_pattern_cost_estimation() {
454        let index_stats = IndexStatistics::new();
455
456        // Subject bound - cheapest
457        let cost = index_stats.estimate_pattern_cost(true, false, false);
458        assert_eq!(cost, 1.0);
459
460        // Predicate bound
461        let cost = index_stats.estimate_pattern_cost(false, true, false);
462        assert_eq!(cost, 1.5);
463
464        // Full scan - most expensive
465        let cost = index_stats.estimate_pattern_cost(false, false, false);
466        assert_eq!(cost, 10.0);
467    }
468}