Skip to main content

grafeo_core/statistics/
rdf.rs

1//! RDF-specific statistics for SPARQL query optimization.
2//!
3//! SPARQL queries are built from triple patterns like `?person :knows ?friend`.
4//! To pick the best join order, the optimizer needs to estimate how many results
5//! each pattern will produce. This module tracks the distribution of subjects,
6//! predicates, and objects to make those estimates.
7
8use super::histogram::Histogram;
9use grafeo_common::types::Value;
10use std::collections::HashMap;
11
12/// Everything the SPARQL optimizer knows about your RDF data.
13///
14/// Use [`estimate_triple_pattern_cardinality()`](Self::estimate_triple_pattern_cardinality)
15/// to predict how many triples match a pattern like `?s :knows ?o`.
16#[derive(Debug, Clone, Default)]
17pub struct RdfStatistics {
18    /// Total number of triples.
19    pub total_triples: u64,
20    /// Number of unique subjects.
21    pub subject_count: u64,
22    /// Number of unique predicates.
23    pub predicate_count: u64,
24    /// Number of unique objects.
25    pub object_count: u64,
26
27    /// Per-predicate statistics.
28    pub predicates: HashMap<String, PredicateStatistics>,
29
30    /// Subject frequency histogram (for join selectivity).
31    pub subject_histogram: Option<Histogram>,
32    /// Object frequency histogram.
33    pub object_histogram: Option<Histogram>,
34
35    /// Index access pattern statistics (for cost model).
36    pub index_stats: IndexStatistics,
37}
38
39impl RdfStatistics {
40    /// Creates new empty RDF statistics.
41    pub fn new() -> Self {
42        Self::default()
43    }
44
45    /// Updates statistics from an RDF store.
46    pub fn update_from_counts(
47        &mut self,
48        total_triples: u64,
49        subject_count: u64,
50        predicate_count: u64,
51        object_count: u64,
52    ) {
53        self.total_triples = total_triples;
54        self.subject_count = subject_count;
55        self.predicate_count = predicate_count;
56        self.object_count = object_count;
57    }
58
59    /// Adds or updates predicate statistics.
60    pub fn update_predicate(&mut self, predicate: &str, stats: PredicateStatistics) {
61        self.predicates.insert(predicate.to_string(), stats);
62    }
63
64    /// Gets predicate statistics.
65    pub fn get_predicate(&self, predicate: &str) -> Option<&PredicateStatistics> {
66        self.predicates.get(predicate)
67    }
68
69    /// Estimates cardinality for a triple pattern.
70    ///
71    /// # Arguments
72    /// * `subject_bound` - Whether the subject is a constant
73    /// * `predicate_bound` - Whether the predicate is a constant (and its value if so)
74    /// * `object_bound` - Whether the object is a constant
75    pub fn estimate_triple_pattern_cardinality(
76        &self,
77        subject_bound: bool,
78        predicate_bound: Option<&str>,
79        object_bound: bool,
80    ) -> f64 {
81        if self.total_triples == 0 {
82            return 0.0;
83        }
84
85        let base = self.total_triples as f64;
86
87        match (subject_bound, predicate_bound, object_bound) {
88            // Fully bound pattern - either 0 or 1
89            (true, Some(_), true) => 1.0,
90
91            // Subject and predicate bound
92            (true, Some(pred), false) => {
93                if let Some(stats) = self.predicates.get(pred) {
94                    // Average objects per subject for this predicate
95                    stats.avg_objects_per_subject()
96                } else {
97                    // Default: assume 10 objects per subject
98                    10.0
99                }
100            }
101
102            // Subject and object bound (any predicate)
103            (true, None, true) => {
104                // Relatively rare - use predicate count as estimate
105                self.predicate_count as f64
106            }
107
108            // Only subject bound
109            (true, None, false) => {
110                // Average triples per subject
111                base / self.subject_count.max(1) as f64
112            }
113
114            // Only predicate bound
115            (false, Some(pred), false) => {
116                if let Some(stats) = self.predicates.get(pred) {
117                    stats.triple_count as f64
118                } else {
119                    base / self.predicate_count.max(1) as f64
120                }
121            }
122
123            // Predicate and object bound
124            (false, Some(pred), true) => {
125                if let Some(stats) = self.predicates.get(pred) {
126                    // Average subjects per object for this predicate
127                    stats.avg_subjects_per_object()
128                } else {
129                    10.0
130                }
131            }
132
133            // Only object bound
134            (false, None, true) => {
135                // Average triples per object
136                base / self.object_count.max(1) as f64
137            }
138
139            // No bindings - full scan
140            (false, None, false) => base,
141        }
142    }
143
144    /// Estimates join selectivity between two patterns sharing a variable.
145    pub fn estimate_join_selectivity(
146        &self,
147        var_position1: TriplePosition,
148        var_position2: TriplePosition,
149    ) -> f64 {
150        let domain_size = match (var_position1, var_position2) {
151            (TriplePosition::Subject, TriplePosition::Subject) => self.subject_count,
152            (TriplePosition::Subject, TriplePosition::Object)
153            | (TriplePosition::Object, TriplePosition::Subject) => {
154                // Subject-object join - use larger domain
155                self.subject_count.max(self.object_count)
156            }
157            (TriplePosition::Object, TriplePosition::Object) => self.object_count,
158            _ => {
159                // Joins involving predicates are rare and highly selective
160                self.predicate_count
161            }
162        };
163
164        if domain_size == 0 {
165            return 1.0;
166        }
167
168        // Join selectivity ≈ 1 / domain_size
169        1.0 / domain_size as f64
170    }
171
172    /// Estimates the cardinality after a FILTER operation.
173    pub fn estimate_filter_selectivity(&self, predicate_iri: Option<&str>) -> f64 {
174        // Default filter selectivity
175        if let Some(pred) = predicate_iri {
176            if let Some(stats) = self.predicates.get(pred) {
177                // Use predicate's object statistics for filter estimation
178                if let Some(ref _hist) = stats.object_histogram {
179                    // Assume filters reduce to ~33% of values
180                    return 0.33;
181                }
182            }
183        }
184        0.33 // Default filter selectivity
185    }
186}
187
188/// Which position in a triple pattern - subject, predicate, or object.
189#[derive(Debug, Clone, Copy, PartialEq, Eq)]
190pub enum TriplePosition {
191    /// Subject position.
192    Subject,
193    /// Predicate position.
194    Predicate,
195    /// Object position.
196    Object,
197}
198
199/// Statistics for a single predicate (like `:knows` or `:name`).
200#[derive(Debug, Clone)]
201pub struct PredicateStatistics {
202    /// Number of triples with this predicate.
203    pub triple_count: u64,
204    /// Number of unique subjects using this predicate.
205    pub distinct_subjects: u64,
206    /// Number of unique objects for this predicate.
207    pub distinct_objects: u64,
208    /// Whether this predicate is functional (1 object per subject).
209    pub is_functional: bool,
210    /// Whether this predicate is inverse functional (1 subject per object).
211    pub is_inverse_functional: bool,
212    /// Object type statistics (for typed literals).
213    pub object_type_distribution: HashMap<String, u64>,
214    /// Histogram of object values (for selective filters).
215    pub object_histogram: Option<Histogram>,
216}
217
218impl PredicateStatistics {
219    /// Creates new predicate statistics.
220    pub fn new(triple_count: u64, distinct_subjects: u64, distinct_objects: u64) -> Self {
221        let is_functional = triple_count > 0 && triple_count == distinct_subjects;
222        let is_inverse_functional = triple_count > 0 && triple_count == distinct_objects;
223
224        Self {
225            triple_count,
226            distinct_subjects,
227            distinct_objects,
228            is_functional,
229            is_inverse_functional,
230            object_type_distribution: HashMap::new(),
231            object_histogram: None,
232        }
233    }
234
235    /// Marks the predicate as functional.
236    pub fn with_functional(mut self, functional: bool) -> Self {
237        self.is_functional = functional;
238        self
239    }
240
241    /// Adds object histogram.
242    pub fn with_object_histogram(mut self, histogram: Histogram) -> Self {
243        self.object_histogram = Some(histogram);
244        self
245    }
246
247    /// Adds object type distribution.
248    pub fn with_object_types(mut self, types: HashMap<String, u64>) -> Self {
249        self.object_type_distribution = types;
250        self
251    }
252
253    /// Average number of objects per subject.
254    pub fn avg_objects_per_subject(&self) -> f64 {
255        if self.distinct_subjects == 0 {
256            return 0.0;
257        }
258        self.triple_count as f64 / self.distinct_subjects as f64
259    }
260
261    /// Average number of subjects per object.
262    pub fn avg_subjects_per_object(&self) -> f64 {
263        if self.distinct_objects == 0 {
264            return 0.0;
265        }
266        self.triple_count as f64 / self.distinct_objects as f64
267    }
268
269    /// Selectivity of a value equality filter on objects.
270    pub fn object_equality_selectivity(&self, _value: &Value) -> f64 {
271        if let Some(ref hist) = self.object_histogram {
272            // Use histogram for better estimate
273            return hist.estimate_equality_selectivity(_value);
274        }
275
276        // Fall back to uniform distribution
277        if self.distinct_objects == 0 {
278            return 1.0;
279        }
280        1.0 / self.distinct_objects as f64
281    }
282}
283
284/// Cost estimates for different index access patterns.
285///
286/// RDF stores typically have multiple indexes (SPO, POS, OSP). This tracks
287/// how expensive each one is to use, so the optimizer can pick the cheapest.
288#[derive(Debug, Clone, Default)]
289pub struct IndexStatistics {
290    /// Average cost of SPO index lookup (subject first).
291    pub spo_lookup_cost: f64,
292    /// Average cost of POS index lookup (predicate first).
293    pub pos_lookup_cost: f64,
294    /// Average cost of OSP index lookup (object first).
295    pub osp_lookup_cost: f64,
296    /// Whether OSP index is available.
297    pub has_osp_index: bool,
298}
299
300impl IndexStatistics {
301    /// Creates default index statistics.
302    pub fn new() -> Self {
303        Self {
304            spo_lookup_cost: 1.0,
305            pos_lookup_cost: 1.5,
306            osp_lookup_cost: 2.0,
307            has_osp_index: true,
308        }
309    }
310
311    /// Estimates the cost of executing a triple pattern.
312    pub fn estimate_pattern_cost(
313        &self,
314        subject_bound: bool,
315        predicate_bound: bool,
316        object_bound: bool,
317    ) -> f64 {
318        match (subject_bound, predicate_bound, object_bound) {
319            // Use SPO index
320            (true, _, _) => self.spo_lookup_cost,
321            // Use POS index
322            (false, true, _) => self.pos_lookup_cost,
323            // Use OSP index if available
324            (false, false, true) if self.has_osp_index => self.osp_lookup_cost,
325            // Full scan
326            _ => 10.0, // High cost for full scan
327        }
328    }
329}
330
331/// Streams triples through to build RDF statistics automatically.
332///
333/// Call [`record_triple()`](Self::record_triple) for each triple, then
334/// [`build()`](Self::build) to get the final [`RdfStatistics`].
335#[derive(Default)]
336pub struct RdfStatisticsCollector {
337    /// Total triple count.
338    triple_count: u64,
339    /// Subject occurrences.
340    subjects: HashMap<String, u64>,
341    /// Predicate occurrences.
342    predicates: HashMap<String, PredicateCollector>,
343    /// Object occurrences.
344    objects: HashMap<String, u64>,
345}
346
347/// Collector for per-predicate statistics.
348#[derive(Default)]
349struct PredicateCollector {
350    count: u64,
351    subjects: HashMap<String, u64>,
352    objects: HashMap<String, u64>,
353}
354
355impl RdfStatisticsCollector {
356    /// Creates a new statistics collector.
357    pub fn new() -> Self {
358        Self::default()
359    }
360
361    /// Records a triple.
362    pub fn record_triple(&mut self, subject: &str, predicate: &str, object: &str) {
363        self.triple_count += 1;
364
365        *self.subjects.entry(subject.to_string()).or_insert(0) += 1;
366        *self.objects.entry(object.to_string()).or_insert(0) += 1;
367
368        let pred_stats = self.predicates.entry(predicate.to_string()).or_default();
369        pred_stats.count += 1;
370        *pred_stats.subjects.entry(subject.to_string()).or_insert(0) += 1;
371        *pred_stats.objects.entry(object.to_string()).or_insert(0) += 1;
372    }
373
374    /// Builds the final statistics.
375    pub fn build(self) -> RdfStatistics {
376        let mut stats = RdfStatistics::new();
377
378        stats.total_triples = self.triple_count;
379        stats.subject_count = self.subjects.len() as u64;
380        stats.predicate_count = self.predicates.len() as u64;
381        stats.object_count = self.objects.len() as u64;
382
383        for (pred, collector) in self.predicates {
384            let pred_stats = PredicateStatistics::new(
385                collector.count,
386                collector.subjects.len() as u64,
387                collector.objects.len() as u64,
388            );
389            stats.predicates.insert(pred, pred_stats);
390        }
391
392        stats.index_stats = IndexStatistics::new();
393
394        stats
395    }
396}
397
398#[cfg(test)]
399mod tests {
400    use super::*;
401
402    #[test]
403    fn test_rdf_statistics_basic() {
404        let mut stats = RdfStatistics::new();
405        stats.update_from_counts(1000, 100, 50, 200);
406
407        // Test triple pattern cardinality estimation
408        let card = stats.estimate_triple_pattern_cardinality(true, None, false);
409        assert!(card > 0.0);
410
411        // Fully unbound should return total
412        let full_card = stats.estimate_triple_pattern_cardinality(false, None, false);
413        assert_eq!(full_card, 1000.0);
414    }
415
416    #[test]
417    fn test_predicate_statistics() {
418        let pred_stats = PredicateStatistics::new(100, 50, 80);
419
420        assert_eq!(pred_stats.avg_objects_per_subject(), 2.0);
421        assert!(!pred_stats.is_functional);
422    }
423
424    #[test]
425    fn test_functional_predicate() {
426        let pred_stats = PredicateStatistics::new(100, 100, 100);
427
428        assert!(pred_stats.is_functional);
429        assert!(pred_stats.is_inverse_functional);
430        assert_eq!(pred_stats.avg_objects_per_subject(), 1.0);
431    }
432
433    #[test]
434    fn test_join_selectivity() {
435        let mut stats = RdfStatistics::new();
436        stats.update_from_counts(1000, 100, 50, 200);
437
438        let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Subject);
439        assert_eq!(sel, 0.01); // 1/100
440
441        let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Object);
442        assert_eq!(sel, 1.0 / 200.0); // 1/max(100, 200)
443    }
444
445    #[test]
446    fn test_statistics_collector() {
447        let mut collector = RdfStatisticsCollector::new();
448
449        collector.record_triple("alice", "knows", "bob");
450        collector.record_triple("alice", "name", "Alice");
451        collector.record_triple("bob", "name", "Bob");
452        collector.record_triple("bob", "knows", "charlie");
453
454        let stats = collector.build();
455
456        assert_eq!(stats.total_triples, 4);
457        assert_eq!(stats.subject_count, 2); // alice, bob
458        assert_eq!(stats.predicate_count, 2); // knows, name
459        assert_eq!(stats.object_count, 4); // bob, Alice, Bob, charlie
460    }
461
462    #[test]
463    fn test_pattern_cost_estimation() {
464        let index_stats = IndexStatistics::new();
465
466        // Subject bound - cheapest
467        let cost = index_stats.estimate_pattern_cost(true, false, false);
468        assert_eq!(cost, 1.0);
469
470        // Predicate bound
471        let cost = index_stats.estimate_pattern_cost(false, true, false);
472        assert_eq!(cost, 1.5);
473
474        // Full scan - most expensive
475        let cost = index_stats.estimate_pattern_cost(false, false, false);
476        assert_eq!(cost, 10.0);
477    }
478}