Skip to main content

grafeo_core/statistics/
rdf.rs

1//! RDF-specific statistics for SPARQL query optimization.
2//!
3//! SPARQL queries are built from triple patterns like `?person :knows ?friend`.
4//! To pick the best join order, the optimizer needs to estimate how many results
5//! each pattern will produce. This module tracks the distribution of subjects,
6//! predicates, and objects to make those estimates.
7
8use super::histogram::Histogram;
9use grafeo_common::types::Value;
10use std::collections::HashMap;
11
12/// Everything the SPARQL optimizer knows about your RDF data.
13///
14/// Use [`estimate_triple_pattern_cardinality()`](Self::estimate_triple_pattern_cardinality)
15/// to predict how many triples match a pattern like `?s :knows ?o`.
16#[derive(Debug, Clone, Default)]
17pub struct RdfStatistics {
18    /// Total number of triples.
19    pub total_triples: u64,
20    /// Number of unique subjects.
21    pub subject_count: u64,
22    /// Number of unique predicates.
23    pub predicate_count: u64,
24    /// Number of unique objects.
25    pub object_count: u64,
26
27    /// Per-predicate statistics.
28    pub predicates: HashMap<String, PredicateStatistics>,
29
30    /// Subject frequency histogram (for join selectivity).
31    pub subject_histogram: Option<Histogram>,
32    /// Object frequency histogram.
33    pub object_histogram: Option<Histogram>,
34
35    /// Index access pattern statistics (for cost model).
36    pub index_stats: IndexStatistics,
37}
38
39impl RdfStatistics {
40    /// Creates new empty RDF statistics.
41    pub fn new() -> Self {
42        Self::default()
43    }
44
45    /// Updates statistics from an RDF store.
46    pub fn update_from_counts(
47        &mut self,
48        total_triples: u64,
49        subject_count: u64,
50        predicate_count: u64,
51        object_count: u64,
52    ) {
53        self.total_triples = total_triples;
54        self.subject_count = subject_count;
55        self.predicate_count = predicate_count;
56        self.object_count = object_count;
57    }
58
59    /// Adds or updates predicate statistics.
60    pub fn update_predicate(&mut self, predicate: &str, stats: PredicateStatistics) {
61        self.predicates.insert(predicate.to_string(), stats);
62    }
63
64    /// Gets predicate statistics.
65    pub fn get_predicate(&self, predicate: &str) -> Option<&PredicateStatistics> {
66        self.predicates.get(predicate)
67    }
68
69    /// Estimates cardinality for a triple pattern.
70    ///
71    /// # Arguments
72    /// * `subject_bound` - Whether the subject is a constant
73    /// * `predicate_bound` - Whether the predicate is a constant (and its value if so)
74    /// * `object_bound` - Whether the object is a constant
75    pub fn estimate_triple_pattern_cardinality(
76        &self,
77        subject_bound: bool,
78        predicate_bound: Option<&str>,
79        object_bound: bool,
80    ) -> f64 {
81        if self.total_triples == 0 {
82            return 0.0;
83        }
84
85        let base = self.total_triples as f64;
86
87        match (subject_bound, predicate_bound, object_bound) {
88            // Fully bound pattern - either 0 or 1
89            (true, Some(_), true) => 1.0,
90
91            // Subject and predicate bound
92            (true, Some(pred), false) => {
93                if let Some(stats) = self.predicates.get(pred) {
94                    // Average objects per subject for this predicate
95                    stats.avg_objects_per_subject()
96                } else {
97                    // Default: assume 10 objects per subject
98                    10.0
99                }
100            }
101
102            // Subject and object bound (any predicate)
103            (true, None, true) => {
104                // Relatively rare - use predicate count as estimate
105                self.predicate_count as f64
106            }
107
108            // Only subject bound
109            (true, None, false) => {
110                // Average triples per subject
111                base / self.subject_count.max(1) as f64
112            }
113
114            // Only predicate bound
115            (false, Some(pred), false) => {
116                if let Some(stats) = self.predicates.get(pred) {
117                    stats.triple_count as f64
118                } else {
119                    base / self.predicate_count.max(1) as f64
120                }
121            }
122
123            // Predicate and object bound
124            (false, Some(pred), true) => {
125                if let Some(stats) = self.predicates.get(pred) {
126                    // Average subjects per object for this predicate
127                    stats.avg_subjects_per_object()
128                } else {
129                    10.0
130                }
131            }
132
133            // Only object bound
134            (false, None, true) => {
135                // Average triples per object
136                base / self.object_count.max(1) as f64
137            }
138
139            // No bindings - full scan
140            (false, None, false) => base,
141        }
142    }
143
144    /// Estimates join selectivity between two patterns sharing a variable.
145    pub fn estimate_join_selectivity(
146        &self,
147        var_position1: TriplePosition,
148        var_position2: TriplePosition,
149    ) -> f64 {
150        let domain_size = match (var_position1, var_position2) {
151            (TriplePosition::Subject, TriplePosition::Subject) => self.subject_count,
152            (TriplePosition::Subject, TriplePosition::Object)
153            | (TriplePosition::Object, TriplePosition::Subject) => {
154                // Subject-object join - use larger domain
155                self.subject_count.max(self.object_count)
156            }
157            (TriplePosition::Object, TriplePosition::Object) => self.object_count,
158            _ => {
159                // Joins involving predicates are rare and highly selective
160                self.predicate_count
161            }
162        };
163
164        if domain_size == 0 {
165            return 1.0;
166        }
167
168        // Join selectivity ≈ 1 / domain_size
169        1.0 / domain_size as f64
170    }
171
172    /// Estimates the cardinality after a FILTER operation.
173    pub fn estimate_filter_selectivity(&self, predicate_iri: Option<&str>) -> f64 {
174        // Default filter selectivity
175        if let Some(pred) = predicate_iri
176            && let Some(stats) = self.predicates.get(pred)
177        {
178            // Use predicate's object statistics for filter estimation
179            if let Some(ref _hist) = stats.object_histogram {
180                // Assume filters reduce to ~33% of values
181                return 0.33;
182            }
183        }
184        0.33 // Default filter selectivity
185    }
186}
187
188/// Which position in a triple pattern - subject, predicate, or object.
189#[derive(Debug, Clone, Copy, PartialEq, Eq)]
190#[non_exhaustive]
191pub enum TriplePosition {
192    /// Subject position.
193    Subject,
194    /// Predicate position.
195    Predicate,
196    /// Object position.
197    Object,
198}
199
200/// Statistics for a single predicate (like `:knows` or `:name`).
201#[derive(Debug, Clone)]
202pub struct PredicateStatistics {
203    /// Number of triples with this predicate.
204    pub triple_count: u64,
205    /// Number of unique subjects using this predicate.
206    pub distinct_subjects: u64,
207    /// Number of unique objects for this predicate.
208    pub distinct_objects: u64,
209    /// Whether this predicate is functional (1 object per subject).
210    pub is_functional: bool,
211    /// Whether this predicate is inverse functional (1 subject per object).
212    pub is_inverse_functional: bool,
213    /// Object type statistics (for typed literals).
214    pub object_type_distribution: HashMap<String, u64>,
215    /// Histogram of object values (for selective filters).
216    pub object_histogram: Option<Histogram>,
217}
218
219impl PredicateStatistics {
220    /// Creates new predicate statistics.
221    pub fn new(triple_count: u64, distinct_subjects: u64, distinct_objects: u64) -> Self {
222        let is_functional = triple_count > 0 && triple_count == distinct_subjects;
223        let is_inverse_functional = triple_count > 0 && triple_count == distinct_objects;
224
225        Self {
226            triple_count,
227            distinct_subjects,
228            distinct_objects,
229            is_functional,
230            is_inverse_functional,
231            object_type_distribution: HashMap::new(),
232            object_histogram: None,
233        }
234    }
235
236    /// Marks the predicate as functional.
237    pub fn with_functional(mut self, functional: bool) -> Self {
238        self.is_functional = functional;
239        self
240    }
241
242    /// Adds object histogram.
243    pub fn with_object_histogram(mut self, histogram: Histogram) -> Self {
244        self.object_histogram = Some(histogram);
245        self
246    }
247
248    /// Adds object type distribution.
249    pub fn with_object_types(mut self, types: HashMap<String, u64>) -> Self {
250        self.object_type_distribution = types;
251        self
252    }
253
254    /// Average number of objects per subject.
255    pub fn avg_objects_per_subject(&self) -> f64 {
256        if self.distinct_subjects == 0 {
257            return 0.0;
258        }
259        self.triple_count as f64 / self.distinct_subjects as f64
260    }
261
262    /// Average number of subjects per object.
263    pub fn avg_subjects_per_object(&self) -> f64 {
264        if self.distinct_objects == 0 {
265            return 0.0;
266        }
267        self.triple_count as f64 / self.distinct_objects as f64
268    }
269
270    /// Selectivity of a value equality filter on objects.
271    pub fn object_equality_selectivity(&self, _value: &Value) -> f64 {
272        if let Some(ref hist) = self.object_histogram {
273            // Use histogram for better estimate
274            return hist.estimate_equality_selectivity(_value);
275        }
276
277        // Fall back to uniform distribution
278        if self.distinct_objects == 0 {
279            return 1.0;
280        }
281        1.0 / self.distinct_objects as f64
282    }
283}
284
285/// Cost estimates for different index access patterns.
286///
287/// RDF stores typically have multiple indexes (SPO, POS, OSP). This tracks
288/// how expensive each one is to use, so the optimizer can pick the cheapest.
289#[derive(Debug, Clone, Default)]
290pub struct IndexStatistics {
291    /// Average cost of SPO index lookup (subject first).
292    pub spo_lookup_cost: f64,
293    /// Average cost of POS index lookup (predicate first).
294    pub pos_lookup_cost: f64,
295    /// Average cost of OSP index lookup (object first).
296    pub osp_lookup_cost: f64,
297    /// Whether OSP index is available.
298    pub has_osp_index: bool,
299}
300
301impl IndexStatistics {
302    /// Creates default index statistics.
303    pub fn new() -> Self {
304        Self {
305            spo_lookup_cost: 1.0,
306            pos_lookup_cost: 1.5,
307            osp_lookup_cost: 2.0,
308            has_osp_index: true,
309        }
310    }
311
312    /// Estimates the cost of executing a triple pattern.
313    ///
314    /// With composite indexes (SP, PO, OS), any 2-bound or 3-bound query is a
315    /// direct hash lookup (cost 1.0). Single-bound queries use single-term
316    /// indexes (cost 1.5). Unbound patterns require a full scan (cost 10.0).
317    pub fn estimate_pattern_cost(
318        &self,
319        subject_bound: bool,
320        predicate_bound: bool,
321        object_bound: bool,
322    ) -> f64 {
323        let bound_count =
324            u8::from(subject_bound) + u8::from(predicate_bound) + u8::from(object_bound);
325        match bound_count {
326            // 3-bound or 2-bound: composite index lookup
327            2..=3 => self.spo_lookup_cost,
328            // 1-bound: single-term index
329            1 => self.pos_lookup_cost,
330            // 0-bound: full scan
331            _ => 10.0,
332        }
333    }
334}
335
336/// Streams triples through to build RDF statistics automatically.
337///
338/// Call [`record_triple()`](Self::record_triple) for each triple, then
339/// [`build()`](Self::build) to get the final [`RdfStatistics`].
340#[derive(Default)]
341pub struct RdfStatisticsCollector {
342    /// Total triple count.
343    triple_count: u64,
344    /// Subject occurrences.
345    subjects: HashMap<String, u64>,
346    /// Predicate occurrences.
347    predicates: HashMap<String, PredicateCollector>,
348    /// Object occurrences.
349    objects: HashMap<String, u64>,
350}
351
352/// Collector for per-predicate statistics.
353#[derive(Default)]
354struct PredicateCollector {
355    count: u64,
356    subjects: HashMap<String, u64>,
357    objects: HashMap<String, u64>,
358}
359
360impl RdfStatisticsCollector {
361    /// Creates a new statistics collector.
362    pub fn new() -> Self {
363        Self::default()
364    }
365
366    /// Records a triple.
367    pub fn record_triple(&mut self, subject: &str, predicate: &str, object: &str) {
368        self.triple_count += 1;
369
370        *self.subjects.entry(subject.to_string()).or_insert(0) += 1;
371        *self.objects.entry(object.to_string()).or_insert(0) += 1;
372
373        let pred_stats = self.predicates.entry(predicate.to_string()).or_default();
374        pred_stats.count += 1;
375        *pred_stats.subjects.entry(subject.to_string()).or_insert(0) += 1;
376        *pred_stats.objects.entry(object.to_string()).or_insert(0) += 1;
377    }
378
379    /// Builds the final statistics.
380    pub fn build(self) -> RdfStatistics {
381        let mut stats = RdfStatistics::new();
382
383        stats.total_triples = self.triple_count;
384        stats.subject_count = self.subjects.len() as u64;
385        stats.predicate_count = self.predicates.len() as u64;
386        stats.object_count = self.objects.len() as u64;
387
388        for (pred, collector) in self.predicates {
389            let pred_stats = PredicateStatistics::new(
390                collector.count,
391                collector.subjects.len() as u64,
392                collector.objects.len() as u64,
393            );
394            stats.predicates.insert(pred, pred_stats);
395        }
396
397        stats.index_stats = IndexStatistics::new();
398
399        stats
400    }
401}
402
403#[cfg(test)]
404mod tests {
405    use super::*;
406
407    #[test]
408    fn test_rdf_statistics_basic() {
409        let mut stats = RdfStatistics::new();
410        stats.update_from_counts(1000, 100, 50, 200);
411
412        // Test triple pattern cardinality estimation
413        let card = stats.estimate_triple_pattern_cardinality(true, None, false);
414        assert!(card > 0.0);
415
416        // Fully unbound should return total
417        let full_card = stats.estimate_triple_pattern_cardinality(false, None, false);
418        assert_eq!(full_card, 1000.0);
419    }
420
421    #[test]
422    fn test_predicate_statistics() {
423        let pred_stats = PredicateStatistics::new(100, 50, 80);
424
425        assert_eq!(pred_stats.avg_objects_per_subject(), 2.0);
426        assert!(!pred_stats.is_functional);
427    }
428
429    #[test]
430    fn test_functional_predicate() {
431        let pred_stats = PredicateStatistics::new(100, 100, 100);
432
433        assert!(pred_stats.is_functional);
434        assert!(pred_stats.is_inverse_functional);
435        assert_eq!(pred_stats.avg_objects_per_subject(), 1.0);
436    }
437
438    #[test]
439    fn test_join_selectivity() {
440        let mut stats = RdfStatistics::new();
441        stats.update_from_counts(1000, 100, 50, 200);
442
443        let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Subject);
444        assert_eq!(sel, 0.01); // 1/100
445
446        let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Object);
447        assert_eq!(sel, 1.0 / 200.0); // 1/max(100, 200)
448    }
449
450    #[test]
451    fn test_statistics_collector() {
452        let mut collector = RdfStatisticsCollector::new();
453
454        collector.record_triple("alix", "knows", "gus");
455        collector.record_triple("alix", "name", "Alix");
456        collector.record_triple("gus", "name", "Gus");
457        collector.record_triple("gus", "knows", "vincent");
458
459        let stats = collector.build();
460
461        assert_eq!(stats.total_triples, 4);
462        assert_eq!(stats.subject_count, 2); // alix, gus
463        assert_eq!(stats.predicate_count, 2); // knows, name
464        assert_eq!(stats.object_count, 4); // gus, Alix, Gus, vincent
465    }
466
467    #[test]
468    fn test_pattern_cost_estimation() {
469        let index_stats = IndexStatistics::new();
470
471        // 2-bound: composite index lookup (cheapest)
472        let cost = index_stats.estimate_pattern_cost(true, true, false);
473        assert_eq!(cost, 1.0);
474
475        // 3-bound: also composite index
476        let cost = index_stats.estimate_pattern_cost(true, true, true);
477        assert_eq!(cost, 1.0);
478
479        // 1-bound: single-term index
480        let cost = index_stats.estimate_pattern_cost(true, false, false);
481        assert_eq!(cost, 1.5);
482
483        let cost = index_stats.estimate_pattern_cost(false, true, false);
484        assert_eq!(cost, 1.5);
485
486        // Full scan: most expensive
487        let cost = index_stats.estimate_pattern_cost(false, false, false);
488        assert_eq!(cost, 10.0);
489    }
490}