oxirs_core/statistics/
graph_stats.rs

1//! High-level RDF graph statistics for query planning.
2//!
3//! Provides:
4//! - [`GraphStatistics`]: Triple count, distinct subject/predicate/object counts.
5//! - `PredicateFrequency`: Frequency distribution of predicates in the graph.
6//! - [`CardinalityEstimator`]: Estimates join cardinality from histograms.
7//! - [`SampledStatistics`]: Statistical sampling for large graphs.
8
9use std::collections::HashMap;
10
11// ---------------------------------------------------------------------------
12// GraphStatistics
13// ---------------------------------------------------------------------------
14
15/// Core statistics about an RDF graph.
16///
17/// Tracks total triple count and the number of distinct subjects, predicates, and objects.
18/// Supports incremental updates as triples are added or removed.
19#[derive(Debug, Clone, Default)]
20pub struct GraphStatistics {
21    total_triples: u64,
22    distinct_subjects: u64,
23    distinct_predicates: u64,
24    distinct_objects: u64,
25    predicate_frequency: HashMap<String, u64>,
26    subject_frequency: HashMap<String, u64>,
27    object_frequency: HashMap<String, u64>,
28}
29
30impl GraphStatistics {
31    /// Create a new, empty statistics instance.
32    pub fn new() -> Self {
33        Self::default()
34    }
35
36    /// Record that a triple `(subject, predicate, object)` was added.
37    pub fn add_triple(&mut self, subject: &str, predicate: &str, object: &str) {
38        self.total_triples += 1;
39        let s_prev = *self.subject_frequency.get(subject).unwrap_or(&0);
40        if s_prev == 0 {
41            self.distinct_subjects += 1;
42        }
43        *self
44            .subject_frequency
45            .entry(subject.to_string())
46            .or_insert(0) += 1;
47
48        let p_prev = *self.predicate_frequency.get(predicate).unwrap_or(&0);
49        if p_prev == 0 {
50            self.distinct_predicates += 1;
51        }
52        *self
53            .predicate_frequency
54            .entry(predicate.to_string())
55            .or_insert(0) += 1;
56
57        let o_prev = *self.object_frequency.get(object).unwrap_or(&0);
58        if o_prev == 0 {
59            self.distinct_objects += 1;
60        }
61        *self.object_frequency.entry(object.to_string()).or_insert(0) += 1;
62    }
63
64    /// Record that a triple was removed.  If the triple was never added, this is a no-op.
65    pub fn remove_triple(&mut self, subject: &str, predicate: &str, object: &str) {
66        if self.total_triples == 0 {
67            return;
68        }
69        self.total_triples = self.total_triples.saturating_sub(1);
70
71        if let Some(cnt) = self.subject_frequency.get_mut(subject) {
72            *cnt = cnt.saturating_sub(1);
73            if *cnt == 0 {
74                self.subject_frequency.remove(subject);
75                self.distinct_subjects = self.distinct_subjects.saturating_sub(1);
76            }
77        }
78
79        if let Some(cnt) = self.predicate_frequency.get_mut(predicate) {
80            *cnt = cnt.saturating_sub(1);
81            if *cnt == 0 {
82                self.predicate_frequency.remove(predicate);
83                self.distinct_predicates = self.distinct_predicates.saturating_sub(1);
84            }
85        }
86
87        if let Some(cnt) = self.object_frequency.get_mut(object) {
88            *cnt = cnt.saturating_sub(1);
89            if *cnt == 0 {
90                self.object_frequency.remove(object);
91                self.distinct_objects = self.distinct_objects.saturating_sub(1);
92            }
93        }
94    }
95
96    /// Total number of triples.
97    pub fn total_triples(&self) -> u64 {
98        self.total_triples
99    }
100
101    /// Number of distinct subjects.
102    pub fn distinct_subjects(&self) -> u64 {
103        self.distinct_subjects
104    }
105
106    /// Number of distinct predicates.
107    pub fn distinct_predicates(&self) -> u64 {
108        self.distinct_predicates
109    }
110
111    /// Number of distinct objects.
112    pub fn distinct_objects(&self) -> u64 {
113        self.distinct_objects
114    }
115
116    /// Frequency of `predicate` (number of triples with that predicate).
117    pub fn predicate_frequency(&self, predicate: &str) -> u64 {
118        *self.predicate_frequency.get(predicate).unwrap_or(&0)
119    }
120
121    /// All predicates sorted by frequency (most frequent first).
122    pub fn top_predicates(&self, limit: usize) -> Vec<(String, u64)> {
123        let mut v: Vec<_> = self
124            .predicate_frequency
125            .iter()
126            .map(|(k, &v)| (k.clone(), v))
127            .collect();
128        v.sort_by_key(|(_, count)| std::cmp::Reverse(*count));
129        v.truncate(limit);
130        v
131    }
132
133    /// Estimate the number of triples for a given triple pattern.
134    ///
135    /// `None` in a slot means "wildcard".  The estimate uses frequency data where available
136    /// and falls back to uniform distribution otherwise.
137    pub fn estimate_cardinality(
138        &self,
139        subject: Option<&str>,
140        predicate: Option<&str>,
141        object: Option<&str>,
142    ) -> u64 {
143        if self.total_triples == 0 {
144            return 0;
145        }
146        let n = self.total_triples as f64;
147
148        let s_sel = match subject {
149            None => 1.0,
150            Some(s) => {
151                let freq = *self.subject_frequency.get(s).unwrap_or(&0) as f64;
152                if freq == 0.0 {
153                    0.0
154                } else {
155                    freq / n
156                }
157            }
158        };
159
160        let p_sel = match predicate {
161            None => 1.0,
162            Some(p) => {
163                let freq = *self.predicate_frequency.get(p).unwrap_or(&0) as f64;
164                if freq == 0.0 {
165                    0.0
166                } else {
167                    freq / n
168                }
169            }
170        };
171
172        let o_sel = match object {
173            None => 1.0,
174            Some(o) => {
175                let freq = *self.object_frequency.get(o).unwrap_or(&0) as f64;
176                if freq == 0.0 {
177                    0.0
178                } else {
179                    freq / n
180                }
181            }
182        };
183
184        (n * s_sel * p_sel * o_sel).ceil() as u64
185    }
186
187    /// Merge another `GraphStatistics` instance into this one (union semantics — no dedup).
188    pub fn merge(&mut self, other: &GraphStatistics) {
189        self.total_triples += other.total_triples;
190        for (s, &cnt) in &other.subject_frequency {
191            let prev = *self.subject_frequency.get(s).unwrap_or(&0);
192            if prev == 0 {
193                self.distinct_subjects += 1;
194            }
195            *self.subject_frequency.entry(s.clone()).or_insert(0) += cnt;
196        }
197        for (p, &cnt) in &other.predicate_frequency {
198            let prev = *self.predicate_frequency.get(p).unwrap_or(&0);
199            if prev == 0 {
200                self.distinct_predicates += 1;
201            }
202            *self.predicate_frequency.entry(p.clone()).or_insert(0) += cnt;
203        }
204        for (o, &cnt) in &other.object_frequency {
205            let prev = *self.object_frequency.get(o).unwrap_or(&0);
206            if prev == 0 {
207                self.distinct_objects += 1;
208            }
209            *self.object_frequency.entry(o.clone()).or_insert(0) += cnt;
210        }
211    }
212}
213
214// ---------------------------------------------------------------------------
215// PredicateHistogram
216// ---------------------------------------------------------------------------
217
218/// Frequency distribution of predicates in an RDF graph.
219///
220/// Maps each predicate IRI to the count of triples that use it.
221#[derive(Debug, Clone, Default)]
222pub struct PredicateHistogram {
223    frequencies: HashMap<String, u64>,
224    total: u64,
225}
226
227impl PredicateHistogram {
228    pub fn new() -> Self {
229        Self::default()
230    }
231
232    /// Record a triple with this predicate.
233    pub fn record(&mut self, predicate: &str) {
234        *self.frequencies.entry(predicate.to_string()).or_insert(0) += 1;
235        self.total += 1;
236    }
237
238    /// Remove a triple with this predicate.
239    pub fn remove(&mut self, predicate: &str) {
240        if let Some(cnt) = self.frequencies.get_mut(predicate) {
241            if *cnt > 0 {
242                *cnt -= 1;
243                self.total = self.total.saturating_sub(1);
244                if *cnt == 0 {
245                    self.frequencies.remove(predicate);
246                }
247            }
248        }
249    }
250
251    /// Frequency (triple count) for `predicate`.
252    pub fn frequency(&self, predicate: &str) -> u64 {
253        *self.frequencies.get(predicate).unwrap_or(&0)
254    }
255
256    /// Relative frequency (fraction of all triples) for `predicate`.
257    pub fn relative_frequency(&self, predicate: &str) -> f64 {
258        if self.total == 0 {
259            0.0
260        } else {
261            *self.frequencies.get(predicate).unwrap_or(&0) as f64 / self.total as f64
262        }
263    }
264
265    /// Total triple count across all predicates.
266    pub fn total(&self) -> u64 {
267        self.total
268    }
269
270    /// Number of distinct predicates.
271    pub fn distinct_count(&self) -> usize {
272        self.frequencies.len()
273    }
274
275    /// Top-N predicates by frequency.
276    pub fn top_n(&self, n: usize) -> Vec<(String, u64)> {
277        let mut v: Vec<_> = self
278            .frequencies
279            .iter()
280            .map(|(k, &v)| (k.clone(), v))
281            .collect();
282        v.sort_by_key(|(_, count)| std::cmp::Reverse(*count));
283        v.truncate(n);
284        v
285    }
286
287    /// Bottom-N predicates by frequency (least used).
288    pub fn bottom_n(&self, n: usize) -> Vec<(String, u64)> {
289        let mut v: Vec<_> = self
290            .frequencies
291            .iter()
292            .map(|(k, &v)| (k.clone(), v))
293            .collect();
294        v.sort_by_key(|(_, count)| *count);
295        v.truncate(n);
296        v
297    }
298
299    /// All frequencies, sorted alphabetically by predicate IRI.
300    pub fn all_sorted(&self) -> Vec<(String, u64)> {
301        let mut v: Vec<_> = self
302            .frequencies
303            .iter()
304            .map(|(k, &v)| (k.clone(), v))
305            .collect();
306        v.sort_by(|a, b| a.0.cmp(&b.0));
307        v
308    }
309}
310
311// ---------------------------------------------------------------------------
312// CardinalityEstimator
313// ---------------------------------------------------------------------------
314
315/// Estimates join cardinality from predicate histograms and overall graph statistics.
316///
317/// Uses independence assumptions (suitable for cost-based query planning) to estimate
318/// the result size of a triple pattern join.
319#[derive(Debug, Clone, Default)]
320pub struct CardinalityEstimator {
321    total_triples: u64,
322    distinct_subjects: u64,
323    distinct_objects: u64,
324    predicate_freq: HashMap<String, u64>,
325}
326
327impl CardinalityEstimator {
328    pub fn new() -> Self {
329        Self::default()
330    }
331
332    /// Build a `CardinalityEstimator` from a `GraphStatistics` instance.
333    pub fn from_graph_stats(stats: &GraphStatistics) -> Self {
334        Self {
335            total_triples: stats.total_triples(),
336            distinct_subjects: stats.distinct_subjects(),
337            distinct_objects: stats.distinct_objects(),
338            predicate_freq: stats.predicate_frequency.clone(),
339        }
340    }
341
342    /// Estimate the cardinality of a single triple pattern `(s, p, o)`.
343    ///
344    /// Each bound slot multiplies the selectivity estimate:
345    /// - Bound subject: selectivity ≈ 1 / distinct_subjects
346    /// - Bound predicate: selectivity ≈ frequency(p) / total_triples
347    /// - Bound object: selectivity ≈ 1 / distinct_objects
348    pub fn estimate_pattern(
349        &self,
350        subject_bound: bool,
351        predicate: Option<&str>,
352        object_bound: bool,
353    ) -> u64 {
354        if self.total_triples == 0 {
355            return 0;
356        }
357        let n = self.total_triples as f64;
358        let mut sel = 1.0_f64;
359
360        if subject_bound {
361            let ds = self.distinct_subjects.max(1) as f64;
362            sel *= 1.0 / ds;
363        }
364
365        if let Some(p) = predicate {
366            let freq = *self.predicate_freq.get(p).unwrap_or(&0) as f64;
367            if freq == 0.0 {
368                return 1; // unknown predicate → conservative estimate
369            }
370            sel *= freq / n;
371        }
372
373        if object_bound {
374            let dobj = self.distinct_objects.max(1) as f64;
375            sel *= 1.0 / dobj;
376        }
377
378        (n * sel).ceil() as u64
379    }
380
381    /// Estimate the cardinality of joining two triple patterns (independence assumption).
382    pub fn estimate_join(&self, left_card: u64, right_card: u64, shared_vars: usize) -> u64 {
383        if self.total_triples == 0 || left_card == 0 || right_card == 0 {
384            return 0;
385        }
386        let n = self.total_triples.max(1) as f64;
387        // Each shared variable reduces the result by a factor of n
388        let reduction = n.powi(shared_vars as i32);
389        ((left_card as f64 * right_card as f64 / reduction).ceil() as u64).max(1)
390    }
391
392    /// Update cardinality stats from a `GraphStatistics` instance.
393    pub fn update_from(&mut self, stats: &GraphStatistics) {
394        self.total_triples = stats.total_triples();
395        self.distinct_subjects = stats.distinct_subjects();
396        self.distinct_objects = stats.distinct_objects();
397        self.predicate_freq = stats.predicate_frequency.clone();
398    }
399}
400
401// ---------------------------------------------------------------------------
402// SampledStatistics
403// ---------------------------------------------------------------------------
404
405/// Approximate statistics computed from a random sample of the triple store.
406///
407/// Useful for large graphs where computing exact statistics is too expensive.
408/// Uses reservoir sampling semantics — the caller feeds triples and the structure
409/// maintains a representative sample of size `sample_size`.
410#[derive(Debug, Clone)]
411pub struct SampledStatistics {
412    sample_size: usize,
413    sample: Vec<(String, String, String)>,
414    total_seen: u64,
415    graph_stats: GraphStatistics,
416}
417
418impl SampledStatistics {
419    /// Create a new sampler that will keep at most `sample_size` triples.
420    pub fn new(sample_size: usize) -> Self {
421        Self {
422            sample_size,
423            sample: Vec::with_capacity(sample_size),
424            total_seen: 0,
425            graph_stats: GraphStatistics::new(),
426        }
427    }
428
429    /// Feed a triple to the sampler.  Uses reservoir sampling to decide whether to keep it.
430    ///
431    /// Because true reservoir sampling requires a PRNG, this implementation uses a deterministic
432    /// hash-based pseudo-random selection so that tests are reproducible without an external RNG.
433    pub fn observe(&mut self, subject: &str, predicate: &str, object: &str) {
434        self.total_seen += 1;
435        self.graph_stats.add_triple(subject, predicate, object);
436
437        if self.sample.len() < self.sample_size {
438            self.sample.push((
439                subject.to_string(),
440                predicate.to_string(),
441                object.to_string(),
442            ));
443        } else {
444            // Deterministic pseudo-random index using FNV of the concatenated triple + counter
445            let hash = Self::hash3(subject, predicate, object, self.total_seen);
446            let idx = (hash % self.total_seen) as usize;
447            if idx < self.sample_size {
448                self.sample[idx] = (
449                    subject.to_string(),
450                    predicate.to_string(),
451                    object.to_string(),
452                );
453            }
454        }
455    }
456
457    fn hash3(s: &str, p: &str, o: &str, n: u64) -> u64 {
458        const FNV_OFFSET: u64 = 14_695_981_039_346_656_037;
459        const FNV_PRIME: u64 = 1_099_511_628_211;
460        let mut h = FNV_OFFSET;
461        for b in s.bytes().chain(p.bytes()).chain(o.bytes()) {
462            h ^= b as u64;
463            h = h.wrapping_mul(FNV_PRIME);
464        }
465        h ^= n;
466        h = h.wrapping_mul(FNV_PRIME);
467        h
468    }
469
470    /// Number of triples seen so far (population size).
471    pub fn total_seen(&self) -> u64 {
472        self.total_seen
473    }
474
475    /// Number of triples in the sample.
476    pub fn sample_size(&self) -> usize {
477        self.sample.len()
478    }
479
480    /// Access the raw sample.
481    pub fn sample(&self) -> &[(String, String, String)] {
482        &self.sample
483    }
484
485    /// Estimated total triples (same as `total_seen`, since each `observe` is one triple).
486    pub fn estimated_total(&self) -> u64 {
487        self.total_seen
488    }
489
490    /// Estimated number of distinct predicates, extrapolated from the sample.
491    pub fn estimated_distinct_predicates(&self) -> u64 {
492        let sample_distinct = self.graph_stats.distinct_predicates();
493        if self.sample.is_empty() || self.total_seen == 0 {
494            return sample_distinct;
495        }
496        // Scale-up: assume same ratio holds for the population
497        let ratio = self.total_seen as f64 / self.sample.len() as f64;
498        (sample_distinct as f64 * ratio.sqrt()).ceil() as u64
499    }
500
501    /// Estimated cardinality for a triple pattern using sample-based selectivity.
502    pub fn estimate_cardinality(
503        &self,
504        subject: Option<&str>,
505        predicate: Option<&str>,
506        object: Option<&str>,
507    ) -> u64 {
508        self.graph_stats
509            .estimate_cardinality(subject, predicate, object)
510    }
511
512    /// Access the underlying sample-based `GraphStatistics`.
513    pub fn graph_stats(&self) -> &GraphStatistics {
514        &self.graph_stats
515    }
516}
517
518// ---------------------------------------------------------------------------
519// Tests
520// ---------------------------------------------------------------------------
521
522#[cfg(test)]
523mod tests {
524    use super::*;
525
526    // ---- GraphStatistics ----
527
528    #[test]
529    fn test_graph_stats_empty() {
530        let gs = GraphStatistics::new();
531        assert_eq!(gs.total_triples(), 0);
532        assert_eq!(gs.distinct_subjects(), 0);
533        assert_eq!(gs.distinct_predicates(), 0);
534        assert_eq!(gs.distinct_objects(), 0);
535    }
536
537    #[test]
538    fn test_graph_stats_add_single_triple() {
539        let mut gs = GraphStatistics::new();
540        gs.add_triple("alice", "knows", "bob");
541        assert_eq!(gs.total_triples(), 1);
542        assert_eq!(gs.distinct_subjects(), 1);
543        assert_eq!(gs.distinct_predicates(), 1);
544        assert_eq!(gs.distinct_objects(), 1);
545    }
546
547    #[test]
548    fn test_graph_stats_add_multiple_triples_same_predicate() {
549        let mut gs = GraphStatistics::new();
550        gs.add_triple("alice", "knows", "bob");
551        gs.add_triple("alice", "knows", "carol");
552        gs.add_triple("bob", "knows", "carol");
553        assert_eq!(gs.total_triples(), 3);
554        assert_eq!(gs.distinct_predicates(), 1);
555        assert_eq!(gs.predicate_frequency("knows"), 3);
556    }
557
558    #[test]
559    fn test_graph_stats_distinct_counts_correct() {
560        let mut gs = GraphStatistics::new();
561        gs.add_triple("s1", "p1", "o1");
562        gs.add_triple("s1", "p2", "o2");
563        gs.add_triple("s2", "p1", "o1");
564        assert_eq!(gs.distinct_subjects(), 2);
565        assert_eq!(gs.distinct_predicates(), 2);
566        assert_eq!(gs.distinct_objects(), 2);
567    }
568
569    #[test]
570    fn test_graph_stats_remove_triple() {
571        let mut gs = GraphStatistics::new();
572        gs.add_triple("s", "p", "o");
573        gs.remove_triple("s", "p", "o");
574        assert_eq!(gs.total_triples(), 0);
575        assert_eq!(gs.distinct_subjects(), 0);
576        assert_eq!(gs.distinct_predicates(), 0);
577        assert_eq!(gs.distinct_objects(), 0);
578    }
579
580    #[test]
581    fn test_graph_stats_remove_partial() {
582        let mut gs = GraphStatistics::new();
583        gs.add_triple("s", "p", "o1");
584        gs.add_triple("s", "p", "o2");
585        gs.remove_triple("s", "p", "o1");
586        assert_eq!(gs.total_triples(), 1);
587        assert_eq!(gs.distinct_subjects(), 1); // s still present
588        assert_eq!(gs.distinct_objects(), 1); // only o2 left
589    }
590
591    #[test]
592    fn test_graph_stats_remove_nonexistent_is_noop() {
593        let mut gs = GraphStatistics::new();
594        gs.remove_triple("nonexistent", "p", "o"); // should not panic
595        assert_eq!(gs.total_triples(), 0);
596    }
597
598    #[test]
599    fn test_graph_stats_estimate_cardinality_wildcard() {
600        let mut gs = GraphStatistics::new();
601        for i in 0..100 {
602            gs.add_triple(&format!("s{i}"), "p", "o");
603        }
604        let card = gs.estimate_cardinality(None, None, None);
605        assert_eq!(card, 100);
606    }
607
608    #[test]
609    fn test_graph_stats_estimate_cardinality_bound_predicate() {
610        let mut gs = GraphStatistics::new();
611        for i in 0..100 {
612            gs.add_triple(&format!("s{i}"), "p1", "o");
613            gs.add_triple(&format!("s{i}"), "p2", "o");
614        }
615        let card = gs.estimate_cardinality(None, Some("p1"), None);
616        assert!(card <= 100);
617        assert!(card >= 1);
618    }
619
620    #[test]
621    fn test_graph_stats_estimate_cardinality_unknown_predicate() {
622        let mut gs = GraphStatistics::new();
623        gs.add_triple("s", "p", "o");
624        let card = gs.estimate_cardinality(None, Some("nonexistent"), None);
625        assert_eq!(card, 0);
626    }
627
628    #[test]
629    fn test_graph_stats_top_predicates() {
630        let mut gs = GraphStatistics::new();
631        for _ in 0..10 {
632            gs.add_triple("s", "popular", "o");
633        }
634        for _ in 0..2 {
635            gs.add_triple("s", "rare", "o");
636        }
637        let top = gs.top_predicates(1);
638        assert_eq!(top[0].0, "popular");
639        assert_eq!(top[0].1, 10);
640    }
641
642    #[test]
643    fn test_graph_stats_merge() {
644        let mut gs1 = GraphStatistics::new();
645        gs1.add_triple("s1", "p", "o1");
646        let mut gs2 = GraphStatistics::new();
647        gs2.add_triple("s2", "p", "o2");
648        gs1.merge(&gs2);
649        assert_eq!(gs1.total_triples(), 2);
650        assert_eq!(gs1.distinct_subjects(), 2);
651    }
652
653    // ---- PredicateHistogram ----
654
655    #[test]
656    fn test_predicate_histogram_empty() {
657        let ph = PredicateHistogram::new();
658        assert_eq!(ph.total(), 0);
659        assert_eq!(ph.distinct_count(), 0);
660        assert_eq!(ph.frequency("any"), 0);
661    }
662
663    #[test]
664    fn test_predicate_histogram_record() {
665        let mut ph = PredicateHistogram::new();
666        ph.record("p1");
667        ph.record("p1");
668        ph.record("p2");
669        assert_eq!(ph.total(), 3);
670        assert_eq!(ph.frequency("p1"), 2);
671        assert_eq!(ph.frequency("p2"), 1);
672        assert_eq!(ph.distinct_count(), 2);
673    }
674
675    #[test]
676    fn test_predicate_histogram_relative_frequency() {
677        let mut ph = PredicateHistogram::new();
678        for _ in 0..3 {
679            ph.record("a");
680        }
681        for _ in 0..7 {
682            ph.record("b");
683        }
684        let rf_a = ph.relative_frequency("a");
685        let rf_b = ph.relative_frequency("b");
686        assert!((rf_a - 0.3).abs() < 1e-9);
687        assert!((rf_b - 0.7).abs() < 1e-9);
688    }
689
690    #[test]
691    fn test_predicate_histogram_remove() {
692        let mut ph = PredicateHistogram::new();
693        ph.record("p");
694        ph.record("p");
695        ph.remove("p");
696        assert_eq!(ph.frequency("p"), 1);
697        ph.remove("p");
698        assert_eq!(ph.frequency("p"), 0);
699        assert_eq!(ph.distinct_count(), 0);
700    }
701
702    #[test]
703    fn test_predicate_histogram_top_n() {
704        let mut ph = PredicateHistogram::new();
705        for i in 0..5 {
706            for _ in 0..(i + 1) {
707                ph.record(&format!("p{i}"));
708            }
709        }
710        let top = ph.top_n(2);
711        assert_eq!(top.len(), 2);
712        assert!(top[0].1 >= top[1].1);
713    }
714
715    #[test]
716    fn test_predicate_histogram_bottom_n() {
717        let mut ph = PredicateHistogram::new();
718        for _ in 0..10 {
719            ph.record("common");
720        }
721        for _ in 0..1 {
722            ph.record("rare");
723        }
724        let bottom = ph.bottom_n(1);
725        assert_eq!(bottom[0].0, "rare");
726    }
727
728    #[test]
729    fn test_predicate_histogram_all_sorted() {
730        let mut ph = PredicateHistogram::new();
731        ph.record("zebra");
732        ph.record("apple");
733        ph.record("mango");
734        let sorted = ph.all_sorted();
735        assert_eq!(sorted[0].0, "apple");
736        assert_eq!(sorted[1].0, "mango");
737        assert_eq!(sorted[2].0, "zebra");
738    }
739
740    // ---- CardinalityEstimator ----
741
742    #[test]
743    fn test_cardinality_estimator_empty() {
744        let ce = CardinalityEstimator::new();
745        assert_eq!(ce.estimate_pattern(false, None, false), 0);
746    }
747
748    #[test]
749    fn test_cardinality_estimator_from_graph_stats() {
750        let mut gs = GraphStatistics::new();
751        for i in 0..50 {
752            gs.add_triple(&format!("s{i}"), "knows", "o");
753        }
754        let ce = CardinalityEstimator::from_graph_stats(&gs);
755        // Pattern: ?s knows ?o → all 50 triples
756        let card = ce.estimate_pattern(false, Some("knows"), false);
757        assert_eq!(card, 50);
758    }
759
760    #[test]
761    fn test_cardinality_estimator_bound_subject_reduces_card() {
762        let mut gs = GraphStatistics::new();
763        for i in 0..100 {
764            gs.add_triple(&format!("s{i}"), "p", "o");
765        }
766        let ce = CardinalityEstimator::from_graph_stats(&gs);
767        let card_with = ce.estimate_pattern(true, None, false);
768        let card_without = ce.estimate_pattern(false, None, false);
769        assert!(card_with <= card_without);
770    }
771
772    #[test]
773    fn test_cardinality_estimator_unknown_predicate() {
774        let mut gs = GraphStatistics::new();
775        gs.add_triple("s", "known_p", "o");
776        let ce = CardinalityEstimator::from_graph_stats(&gs);
777        let card = ce.estimate_pattern(false, Some("unknown_p"), false);
778        assert_eq!(card, 1); // conservative estimate
779    }
780
781    #[test]
782    fn test_cardinality_estimator_estimate_join() {
783        let mut gs = GraphStatistics::new();
784        for i in 0..100 {
785            gs.add_triple(&format!("s{i}"), "p", "o");
786        }
787        let ce = CardinalityEstimator::from_graph_stats(&gs);
788        let join_card = ce.estimate_join(50, 50, 1);
789        // join_card = ceil(50 * 50 / 100) = 25
790        assert_eq!(join_card, 25);
791    }
792
793    #[test]
794    fn test_cardinality_estimator_join_zero() {
795        let ce = CardinalityEstimator::new();
796        assert_eq!(ce.estimate_join(10, 10, 0), 0);
797    }
798
799    #[test]
800    fn test_cardinality_estimator_update_from() {
801        let ce_before = CardinalityEstimator::new();
802        assert_eq!(ce_before.estimate_pattern(false, None, false), 0);
803        let mut gs = GraphStatistics::new();
804        gs.add_triple("s", "p", "o");
805        let mut ce = CardinalityEstimator::new();
806        ce.update_from(&gs);
807        assert_eq!(ce.estimate_pattern(false, None, false), 1);
808    }
809
810    // ---- SampledStatistics ----
811
812    #[test]
813    fn test_sampled_statistics_empty() {
814        let ss = SampledStatistics::new(100);
815        assert_eq!(ss.total_seen(), 0);
816        assert_eq!(ss.sample_size(), 0);
817    }
818
819    #[test]
820    fn test_sampled_statistics_observe_fewer_than_capacity() {
821        let mut ss = SampledStatistics::new(100);
822        ss.observe("s", "p", "o");
823        ss.observe("s2", "p2", "o2");
824        assert_eq!(ss.total_seen(), 2);
825        assert_eq!(ss.sample_size(), 2);
826    }
827
828    #[test]
829    fn test_sampled_statistics_observe_more_than_capacity() {
830        let mut ss = SampledStatistics::new(10);
831        for i in 0..50 {
832            ss.observe(&format!("s{i}"), "p", &format!("o{i}"));
833        }
834        assert_eq!(ss.total_seen(), 50);
835        assert_eq!(ss.sample_size(), 10);
836    }
837
838    #[test]
839    fn test_sampled_statistics_estimated_total() {
840        let mut ss = SampledStatistics::new(100);
841        for i in 0..200 {
842            ss.observe(&format!("s{i}"), "p", "o");
843        }
844        assert_eq!(ss.estimated_total(), 200);
845    }
846
847    #[test]
848    fn test_sampled_statistics_cardinality_from_sample() {
849        let mut ss = SampledStatistics::new(1000);
850        for i in 0..100 {
851            ss.observe(&format!("s{i}"), "p", "o");
852        }
853        let card = ss.estimate_cardinality(None, Some("p"), None);
854        assert!(card > 0);
855    }
856
857    #[test]
858    fn test_sampled_statistics_estimated_distinct_predicates() {
859        let mut ss = SampledStatistics::new(50);
860        for i in 0..5 {
861            ss.observe("s", &format!("p{i}"), "o");
862        }
863        let est = ss.estimated_distinct_predicates();
864        assert!(est >= 5);
865    }
866
867    #[test]
868    fn test_sampled_statistics_graph_stats_available() {
869        let mut ss = SampledStatistics::new(100);
870        ss.observe("alice", "knows", "bob");
871        let gs = ss.graph_stats();
872        assert_eq!(gs.total_triples(), 1);
873    }
874}
oxirs_core/statistics/graph_stats.rs

oxirs_core/statistics/
graph_stats.rs