oxirs_core/statistics/
histogram.rs

1//! Equi-depth histogram statistics for cardinality estimation in the cost-based optimizer.
2//!
3//! This module provides column-level statistics (histograms) over predicate-object distributions
4//! in an RDF dataset, enabling the optimizer to make better cardinality estimates when planning
5//! join orders and filter evaluation.
6//!
7//! # Design
8//!
9//! Each predicate gets its own [`PredicateHistogram`] that maintains an equi-depth histogram over
10//! the distribution of object values observed for that predicate. The histogram is built from raw
11//! value streams using a sorting-and-splitting approach, and can be updated incrementally (with a
12//! degraded accuracy guarantee) as new triples arrive.
13//!
14//! A [`DatasetStatistics`] struct aggregates all per-predicate histograms and provides the
15//! dataset-level estimates (subject/predicate/object counts) used by the optimizer.
16
17use std::collections::HashMap;
18use std::time::Instant;
19
20/// A single bucket in an equi-depth histogram.
21///
22/// All values in the range `[lower_bound, upper_bound]` (lexicographic) fall into this bucket.
23#[derive(Debug, Clone)]
24pub struct HistogramBucket {
25    /// Minimum value in this bucket (inclusive, string-encoded).
26    pub lower_bound: String,
27    /// Maximum value in this bucket (inclusive, string-encoded).
28    pub upper_bound: String,
29    /// Approximate number of values (triples) whose object falls in this bucket.
30    pub count: u64,
31    /// Approximate number of distinct values in this bucket.
32    pub distinct_count: u64,
33    /// Number of null / missing values encountered when building this bucket.
34    pub null_count: u64,
35}
36
37impl HistogramBucket {
38    /// Returns true if `value` falls within the closed range `[lower_bound, upper_bound]`.
39    pub fn contains(&self, value: &str) -> bool {
40        value >= self.lower_bound.as_str() && value <= self.upper_bound.as_str()
41    }
42
43    /// Returns the fraction `[0, 1]` of the bucket that is "covered" by the closed range
44    /// `[lo, hi]`.  Used for range selectivity estimation with linear interpolation.
45    pub fn overlap_fraction(&self, lo: &str, hi: &str) -> f64 {
46        // Both empty: full coverage.
47        if self.lower_bound.is_empty() && self.upper_bound.is_empty() {
48            return 1.0;
49        }
50        let lb = self.lower_bound.as_str();
51        let ub = self.upper_bound.as_str();
52
53        // No overlap at all.
54        if hi < lb || lo > ub {
55            return 0.0;
56        }
57
58        // Full containment.
59        if lo <= lb && hi >= ub {
60            return 1.0;
61        }
62
63        // Partial overlap — approximate via linear interpolation over lexicographic key space.
64        // We convert strings to a rough u64 "position" by looking at the first 8 bytes.
65        let to_pos = |s: &str| -> u64 {
66            let bytes = s.as_bytes();
67            let mut v: u64 = 0;
68            for (i, &b) in bytes.iter().enumerate().take(8) {
69                v |= (b as u64) << (56 - i * 8);
70            }
71            v
72        };
73
74        let pos_lb = to_pos(lb);
75        let pos_ub = to_pos(ub);
76        let pos_lo = to_pos(lo).max(pos_lb);
77        let pos_hi = to_pos(hi).min(pos_ub);
78
79        if pos_ub == pos_lb {
80            return 1.0;
81        }
82
83        let overlap = pos_hi.saturating_sub(pos_lo) as f64;
84        let total = (pos_ub - pos_lb) as f64;
85        (overlap / total).clamp(0.0, 1.0)
86    }
87}
88
89/// Equi-depth histogram for the object values of a specific predicate.
90///
91/// All buckets have approximately equal `count` (depth), spread across the ordered value space.
92#[derive(Debug, Clone)]
93pub struct PredicateHistogram {
94    /// IRI of the predicate this histogram covers.
95    pub predicate_iri: String,
96    /// Total number of non-null values seen.
97    pub total_count: u64,
98    /// Number of null / missing object values.
99    pub null_count: u64,
100    /// Number of globally distinct object values (approximate, from build phase).
101    pub distinct_count: u64,
102    /// Equi-depth buckets (sorted by lower_bound ascending).
103    pub buckets: Vec<HistogramBucket>,
104}
105
106impl PredicateHistogram {
107    /// Create an empty histogram for `predicate` with `num_buckets` bucket slots.
108    pub fn new(predicate: &str, num_buckets: usize) -> Self {
109        Self {
110            predicate_iri: predicate.to_owned(),
111            total_count: 0,
112            null_count: 0,
113            distinct_count: 0,
114            buckets: Vec::with_capacity(num_buckets),
115        }
116    }
117
118    /// Build an equi-depth histogram from `values`, targeting `num_buckets` buckets.
119    ///
120    /// The approach is:
121    /// 1. Sort all (non-empty) values lexicographically.
122    /// 2. Split into `num_buckets` equal-sized slices.
123    /// 3. For each slice compute distinct count via deduplication.
124    ///
125    /// Complexity: O(n log n) where n = `values.len()`.
126    pub fn build_from_values(predicate: &str, values: &[String], num_buckets: usize) -> Self {
127        let num_buckets = num_buckets.max(1);
128
129        let null_count = values.iter().filter(|v| v.is_empty()).count() as u64;
130
131        let mut sorted: Vec<&String> = values.iter().filter(|v| !v.is_empty()).collect();
132        sorted.sort_unstable();
133
134        let total_count = sorted.len() as u64;
135
136        // Count distinct values.
137        let mut distinct_count = 0u64;
138        {
139            let mut prev: Option<&str> = None;
140            for v in &sorted {
141                if prev != Some(v.as_str()) {
142                    distinct_count += 1;
143                    prev = Some(v.as_str());
144                }
145            }
146        }
147
148        let mut buckets: Vec<HistogramBucket> = Vec::with_capacity(num_buckets);
149
150        if sorted.is_empty() {
151            return Self {
152                predicate_iri: predicate.to_owned(),
153                total_count,
154                null_count,
155                distinct_count,
156                buckets,
157            };
158        }
159
160        let target_depth = (sorted.len() as f64 / num_buckets as f64).ceil() as usize;
161        let target_depth = target_depth.max(1);
162
163        let mut cursor = 0usize;
164        while cursor < sorted.len() {
165            let end = (cursor + target_depth).min(sorted.len());
166            let slice = &sorted[cursor..end];
167
168            // Move `end` forward to not split equal values across adjacent buckets.
169            let lower_bound = slice[0].clone();
170            let upper_bound = slice[slice.len() - 1].clone();
171
172            // Distinct count in this bucket.
173            let mut bucket_distinct = 0u64;
174            let mut prev: Option<&str> = None;
175            for v in slice {
176                if prev != Some(v.as_str()) {
177                    bucket_distinct += 1;
178                    prev = Some(v.as_str());
179                }
180            }
181
182            buckets.push(HistogramBucket {
183                lower_bound,
184                upper_bound,
185                count: slice.len() as u64,
186                distinct_count: bucket_distinct,
187                null_count: 0,
188            });
189
190            cursor = end;
191        }
192
193        Self {
194            predicate_iri: predicate.to_owned(),
195            total_count,
196            null_count,
197            distinct_count,
198            buckets,
199        }
200    }
201
202    /// Estimate the selectivity (fraction of rows) for an equality predicate `object = value`.
203    ///
204    /// Returns a value in `[0, 1]`.  Uses a uniform assumption within the containing bucket.
205    pub fn estimate_selectivity_eq(&self, value: &str) -> f64 {
206        if self.total_count == 0 {
207            return 0.0;
208        }
209
210        for bucket in &self.buckets {
211            if bucket.contains(value) {
212                let distinct = bucket.distinct_count.max(1) as f64;
213                let bucket_sel = bucket.count as f64 / self.total_count as f64;
214                // Assume uniform distribution within bucket.
215                return bucket_sel / distinct;
216            }
217        }
218
219        // Value not in any bucket — assume extremely rare.
220        // Use (distinct_count + 1) as denominator to model "one extra unseen value beyond all known
221        // distinct ones", which is strictly less than 1/distinct_count and much smaller than any
222        // in-bucket estimate.  Adding 1 prevents the edge case where distinct_count == total_count
223        // would yield exactly 1/total_count (i.e., 0.01 for 10 distinct values in 10 total).
224        let denominator =
225            (self.distinct_count.max(1) + 1) as f64 * (self.total_count.max(1) as f64);
226        1.0 / denominator
227    }
228
229    /// Estimate the selectivity for a range predicate `lo <= object <= hi`.
230    ///
231    /// Returns a value in `[0, 1]`.
232    pub fn estimate_selectivity_range(&self, lo: &str, hi: &str) -> f64 {
233        if self.total_count == 0 {
234            return 0.0;
235        }
236
237        let mut covered_rows: f64 = 0.0;
238        for bucket in &self.buckets {
239            let frac = bucket.overlap_fraction(lo, hi);
240            if frac > 0.0 {
241                covered_rows += frac * bucket.count as f64;
242            }
243        }
244
245        (covered_rows / self.total_count as f64).clamp(0.0, 1.0)
246    }
247
248    /// Estimate the expected cardinality for a pattern that filters this predicate's objects.
249    ///
250    /// If `predicate_filter` is `Some(value)` an equality filter is applied; otherwise all
251    /// triples with this predicate are returned.
252    ///
253    /// `total_triples` is used as a fallback denominator when `self.total_count` is zero.
254    pub fn estimate_cardinality(&self, total_triples: u64, predicate_filter: Option<&str>) -> u64 {
255        let base = if self.total_count > 0 {
256            self.total_count
257        } else {
258            total_triples
259        };
260
261        if base == 0 {
262            return 0;
263        }
264
265        match predicate_filter {
266            None => base,
267            Some(value) => {
268                let sel = self.estimate_selectivity_eq(value);
269                ((base as f64 * sel).ceil() as u64).max(1)
270            }
271        }
272    }
273
274    /// Incrementally update the histogram with additional observed values.
275    ///
276    /// This is an approximate update: new values are inserted into the most appropriate existing
277    /// bucket (lowest `lower_bound` not greater than the value), and global counts are adjusted.
278    /// After many incremental updates, accuracy degrades and a full rebuild is recommended.
279    pub fn update_incremental(&mut self, new_values: &[String]) {
280        for value in new_values {
281            if value.is_empty() {
282                self.null_count += 1;
283                continue;
284            }
285
286            self.total_count += 1;
287
288            // Find the bucket whose range contains the value, or the last bucket.
289            let idx = self
290                .buckets
291                .iter()
292                .rposition(|b| b.lower_bound.as_str() <= value.as_str())
293                .unwrap_or(0);
294
295            if let Some(bucket) = self.buckets.get_mut(idx) {
296                bucket.count += 1;
297                // Heuristic: if value is outside current range, expand bounds.
298                if value.as_str() < bucket.lower_bound.as_str() {
299                    bucket.lower_bound = value.clone();
300                }
301                if value.as_str() > bucket.upper_bound.as_str() {
302                    bucket.upper_bound = value.clone();
303                    // Ripple update: adjust distinct count (approximate).
304                    bucket.distinct_count += 1;
305                    self.distinct_count += 1;
306                }
307            } else if !self.buckets.is_empty() {
308                // Append to the last bucket.
309                let last = self
310                    .buckets
311                    .last_mut()
312                    .expect("buckets non-empty checked above");
313                last.count += 1;
314                if value.as_str() > last.upper_bound.as_str() {
315                    last.upper_bound = value.clone();
316                    last.distinct_count += 1;
317                    self.distinct_count += 1;
318                }
319            } else {
320                // No buckets yet — create a single bucket.
321                self.buckets.push(HistogramBucket {
322                    lower_bound: value.clone(),
323                    upper_bound: value.clone(),
324                    count: 1,
325                    distinct_count: 1,
326                    null_count: 0,
327                });
328                self.distinct_count += 1;
329            }
330        }
331    }
332
333    /// Returns the number of buckets in this histogram.
334    pub fn num_buckets(&self) -> usize {
335        self.buckets.len()
336    }
337}
338
339/// Dataset-level statistics aggregating per-predicate histograms.
340///
341/// Provides the interface expected by the cost-based optimizer for pattern cardinality estimation.
342pub struct DatasetStatistics {
343    /// Per-predicate equi-depth histograms, keyed by predicate IRI.
344    histograms: HashMap<String, PredicateHistogram>,
345    /// Total number of triples in the dataset.
346    total_triples: u64,
347    /// Approximate number of distinct subjects.
348    distinct_subjects: u64,
349    /// Approximate number of distinct predicates.
350    distinct_predicates: u64,
351    /// Approximate number of distinct objects.
352    distinct_objects: u64,
353    /// When statistics were last updated.
354    last_updated: Instant,
355    /// Target number of buckets per histogram (default: 100).
356    num_buckets: usize,
357}
358
359impl DatasetStatistics {
360    /// Create a new empty statistics store with the default bucket count (100).
361    pub fn new() -> Self {
362        Self::with_num_buckets(100)
363    }
364
365    /// Create a new statistics store with a custom bucket count.
366    pub fn with_num_buckets(num_buckets: usize) -> Self {
367        Self {
368            histograms: HashMap::new(),
369            total_triples: 0,
370            distinct_subjects: 0,
371            distinct_predicates: 0,
372            distinct_objects: 0,
373            last_updated: Instant::now(),
374            num_buckets: num_buckets.max(1),
375        }
376    }
377
378    /// Record the insertion of a single triple `(subject, predicate, object)`.
379    ///
380    /// Updates global counts and incrementally updates the per-predicate histogram.
381    pub fn record_triple(&mut self, predicate: &str, object: &str) {
382        self.total_triples += 1;
383
384        let num_buckets = self.num_buckets;
385        let histogram = self
386            .histograms
387            .entry(predicate.to_owned())
388            .or_insert_with(|| PredicateHistogram::new(predicate, num_buckets));
389
390        histogram.update_incremental(&[object.to_owned()]);
391
392        self.distinct_predicates = self.histograms.len() as u64;
393        self.last_updated = Instant::now();
394    }
395
396    /// Return the histogram for `predicate`, if one exists.
397    pub fn get_histogram(&self, predicate: &str) -> Option<&PredicateHistogram> {
398        self.histograms.get(predicate)
399    }
400
401    /// Estimate the cardinality of the triple pattern `(?s, predicate?, object?)`.
402    ///
403    /// Pattern selectivity is computed as:
404    /// - Unknown subject and unknown predicate → `total_triples`.
405    /// - Known predicate, unknown object → `histogram.total_count`.
406    /// - Known predicate + known object → equality filter estimate.
407    /// - Known object, unknown predicate → brute-force across all histograms.
408    /// - Unknown predicate and unknown object → `total_triples`.
409    pub fn estimate_pattern_cardinality(
410        &self,
411        subject: Option<&str>,
412        predicate: Option<&str>,
413        object: Option<&str>,
414    ) -> u64 {
415        if self.total_triples == 0 {
416            return 0;
417        }
418
419        // Subject binding multiplies selectivity by ~1/distinct_subjects.
420        let subject_selectivity = if subject.is_some() {
421            if self.distinct_subjects > 1 {
422                1.0 / self.distinct_subjects as f64
423            } else {
424                0.001
425            }
426        } else {
427            1.0
428        };
429
430        let base_cardinality: f64 = match (predicate, object) {
431            (None, None) => self.total_triples as f64,
432
433            (Some(pred), None) => {
434                // All triples with this predicate.
435                self.histograms
436                    .get(pred)
437                    .map(|h| h.total_count as f64)
438                    .unwrap_or_else(|| {
439                        // Unknown predicate — assume uniform distribution.
440                        if self.distinct_predicates > 0 {
441                            self.total_triples as f64 / self.distinct_predicates as f64
442                        } else {
443                            self.total_triples as f64 * 0.1
444                        }
445                    })
446            }
447
448            (Some(pred), Some(obj)) => {
449                // Equality filter on object for a specific predicate.
450                match self.histograms.get(pred) {
451                    Some(hist) => {
452                        let sel = hist.estimate_selectivity_eq(obj);
453                        (hist.total_count as f64 * sel).max(1.0)
454                    }
455                    None => 1.0, // Unknown predicate → assume at most 1 result.
456                }
457            }
458
459            (None, Some(obj)) => {
460                // Sum estimates across all predicates for the given object value.
461                if self.histograms.is_empty() {
462                    return 0;
463                }
464                self.histograms
465                    .values()
466                    .map(|hist| {
467                        let sel = hist.estimate_selectivity_eq(obj);
468                        hist.total_count as f64 * sel
469                    })
470                    .sum::<f64>()
471                    .max(1.0)
472            }
473        };
474
475        ((base_cardinality * subject_selectivity).ceil() as u64).max(1)
476    }
477
478    /// Rebuild all histograms from a full scan of `(predicate, object)` pairs.
479    ///
480    /// This is an expensive O(n log n) operation and should be run during off-peak periods or
481    /// triggered by the auto-analyze subsystem.
482    pub fn rebuild_histograms(&mut self, values: &[(String, String)]) {
483        self.histograms.clear();
484        self.total_triples = values.len() as u64;
485
486        // Group by predicate.
487        let mut by_predicate: HashMap<&str, Vec<&String>> = HashMap::new();
488        for (pred, obj) in values {
489            by_predicate.entry(pred.as_str()).or_default().push(obj);
490        }
491
492        let num_buckets = self.num_buckets;
493        for (pred, objs) in &by_predicate {
494            let owned_objs: Vec<String> = objs.iter().map(|s| (*s).clone()).collect();
495            let histogram = PredicateHistogram::build_from_values(pred, &owned_objs, num_buckets);
496            self.histograms.insert(pred.to_string(), histogram);
497        }
498
499        // Update distinct counts.
500        self.distinct_predicates = self.histograms.len() as u64;
501        // Approximate distinct objects as the sum of distinct objects per predicate
502        // (may overcount if the same literal appears under different predicates).
503        self.distinct_objects = self.histograms.values().map(|h| h.distinct_count).sum();
504
505        self.last_updated = Instant::now();
506    }
507
508    /// Return the total number of triples tracked by these statistics.
509    pub fn total_triples(&self) -> u64 {
510        self.total_triples
511    }
512
513    /// Update the known distinct subject count.
514    ///
515    /// Called by the store layer after it recomputes subject cardinality.
516    pub fn set_distinct_subjects(&mut self, count: u64) {
517        self.distinct_subjects = count;
518    }
519
520    /// Update the known distinct object count.
521    pub fn set_distinct_objects(&mut self, count: u64) {
522        self.distinct_objects = count;
523    }
524
525    /// Return when the statistics were last updated.
526    pub fn last_updated(&self) -> Instant {
527        self.last_updated
528    }
529
530    /// Return the number of predicates with histograms.
531    pub fn predicate_count(&self) -> usize {
532        self.histograms.len()
533    }
534}
535
536impl Default for DatasetStatistics {
537    fn default() -> Self {
538        Self::new()
539    }
540}
541
542#[cfg(test)]
543mod tests {
544    use super::*;
545
546    // --- HistogramBucket tests ---
547
548    #[test]
549    fn test_bucket_contains() {
550        let bucket = HistogramBucket {
551            lower_bound: "apple".to_string(),
552            upper_bound: "mango".to_string(),
553            count: 10,
554            distinct_count: 5,
555            null_count: 0,
556        };
557        assert!(bucket.contains("apple"));
558        assert!(bucket.contains("cherry"));
559        assert!(bucket.contains("mango"));
560        assert!(!bucket.contains("aardvark")); // < lower_bound
561        assert!(!bucket.contains("zebra")); // > upper_bound
562    }
563
564    #[test]
565    fn test_bucket_overlap_fraction_full() {
566        let bucket = HistogramBucket {
567            lower_bound: "b".to_string(),
568            upper_bound: "m".to_string(),
569            count: 100,
570            distinct_count: 50,
571            null_count: 0,
572        };
573        // Full containment
574        let frac = bucket.overlap_fraction("a", "z");
575        assert!(frac > 0.99, "Expected ~1.0 but got {}", frac);
576    }
577
578    #[test]
579    fn test_bucket_overlap_fraction_none() {
580        let bucket = HistogramBucket {
581            lower_bound: "m".to_string(),
582            upper_bound: "z".to_string(),
583            count: 100,
584            distinct_count: 50,
585            null_count: 0,
586        };
587        // No overlap
588        let frac = bucket.overlap_fraction("a", "b");
589        assert_eq!(frac, 0.0);
590    }
591
592    // --- PredicateHistogram tests ---
593
594    #[test]
595    fn test_build_from_empty_values() {
596        let hist = PredicateHistogram::build_from_values("http://example.org/p", &[], 10);
597        assert_eq!(hist.total_count, 0);
598        assert_eq!(hist.buckets.len(), 0);
599    }
600
601    #[test]
602    fn test_build_from_values_single_bucket() {
603        let values: Vec<String> = (0..5).map(|i| format!("value{}", i)).collect();
604        let hist = PredicateHistogram::build_from_values("p", &values, 1);
605        assert_eq!(hist.total_count, 5);
606        assert_eq!(hist.buckets.len(), 1);
607        assert_eq!(hist.buckets[0].count, 5);
608    }
609
610    #[test]
611    fn test_build_from_values_multiple_buckets() {
612        let values: Vec<String> = (0..100).map(|i| format!("value{:03}", i)).collect();
613        let hist = PredicateHistogram::build_from_values("p", &values, 10);
614        assert_eq!(hist.total_count, 100);
615        assert_eq!(hist.distinct_count, 100);
616        // 10 buckets, each ~10 entries
617        assert!(hist.buckets.len() >= 9 && hist.buckets.len() <= 11);
618
619        // Total count across buckets must equal total_count
620        let sum: u64 = hist.buckets.iter().map(|b| b.count).sum();
621        assert_eq!(sum, 100);
622    }
623
624    #[test]
625    fn test_build_handles_nulls() {
626        let mut values: Vec<String> = (0..10).map(|i| format!("v{}", i)).collect();
627        values.push(String::new()); // null
628        values.push(String::new()); // null
629        let hist = PredicateHistogram::build_from_values("p", &values, 5);
630        assert_eq!(hist.null_count, 2);
631        assert_eq!(hist.total_count, 10);
632    }
633
634    #[test]
635    fn test_estimate_selectivity_eq_within_bucket() {
636        let values: Vec<String> = vec![
637            "alpha".to_string(),
638            "beta".to_string(),
639            "gamma".to_string(),
640            "delta".to_string(),
641            "epsilon".to_string(),
642        ];
643        let hist = PredicateHistogram::build_from_values("p", &values, 2);
644        // Selectivity for a value in the dataset must be > 0
645        let sel = hist.estimate_selectivity_eq("beta");
646        assert!(sel > 0.0 && sel <= 1.0, "Selectivity {} out of range", sel);
647    }
648
649    #[test]
650    fn test_estimate_selectivity_eq_missing_value() {
651        let values: Vec<String> = (0..10).map(|i| format!("value{}", i)).collect();
652        let hist = PredicateHistogram::build_from_values("p", &values, 5);
653        let sel = hist.estimate_selectivity_eq("zzz_not_in_dataset");
654        // Should be very small
655        assert!(
656            sel < 0.01,
657            "Selectivity for missing value too high: {}",
658            sel
659        );
660    }
661
662    #[test]
663    fn test_estimate_selectivity_range() {
664        let values: Vec<String> = (0..100).map(|i| format!("{:03}", i)).collect();
665        let hist = PredicateHistogram::build_from_values("p", &values, 10);
666        // Range covering ~50% of values
667        let sel = hist.estimate_selectivity_range("000", "049");
668        assert!(
669            sel > 0.3 && sel < 0.7,
670            "Range selectivity {} unexpected",
671            sel
672        );
673    }
674
675    #[test]
676    fn test_estimate_cardinality_no_filter() {
677        let values: Vec<String> = (0..50).map(|i| format!("v{}", i)).collect();
678        let hist = PredicateHistogram::build_from_values("p", &values, 5);
679        let card = hist.estimate_cardinality(1000, None);
680        assert_eq!(card, 50);
681    }
682
683    #[test]
684    fn test_estimate_cardinality_with_filter() {
685        let values: Vec<String> = (0..100).map(|i| format!("{:03}", i)).collect();
686        let hist = PredicateHistogram::build_from_values("p", &values, 10);
687        let card = hist.estimate_cardinality(1000, Some("050"));
688        // Should return at least 1
689        assert!(card >= 1);
690    }
691
692    #[test]
693    fn test_update_incremental() {
694        let initial: Vec<String> = (0..10).map(|i| format!("v{}", i)).collect();
695        let mut hist = PredicateHistogram::build_from_values("p", &initial, 3);
696        let before_count = hist.total_count;
697
698        let new_vals: Vec<String> = vec!["newA".to_string(), "newB".to_string()];
699        hist.update_incremental(&new_vals);
700
701        assert_eq!(hist.total_count, before_count + 2);
702    }
703
704    #[test]
705    fn test_update_incremental_null() {
706        let initial: Vec<String> = (0..5).map(|i| format!("v{}", i)).collect();
707        let mut hist = PredicateHistogram::build_from_values("p", &initial, 2);
708        hist.update_incremental(&[String::new()]);
709        assert_eq!(hist.null_count, 1);
710    }
711
712    // --- DatasetStatistics tests ---
713
714    #[test]
715    fn test_dataset_statistics_new() {
716        let stats = DatasetStatistics::new();
717        assert_eq!(stats.total_triples(), 0);
718        assert_eq!(stats.predicate_count(), 0);
719    }
720
721    #[test]
722    fn test_dataset_record_triple() {
723        let mut stats = DatasetStatistics::new();
724        stats.record_triple("http://example.org/age", "42");
725        stats.record_triple("http://example.org/age", "25");
726        stats.record_triple("http://example.org/name", "Alice");
727
728        assert_eq!(stats.total_triples(), 3);
729        assert_eq!(stats.predicate_count(), 2);
730        assert!(stats.get_histogram("http://example.org/age").is_some());
731    }
732
733    #[test]
734    fn test_dataset_estimate_pattern_cardinality_no_bindings() {
735        let mut stats = DatasetStatistics::new();
736        for i in 0..20 {
737            stats.record_triple("http://p", &format!("obj{}", i));
738        }
739        let card = stats.estimate_pattern_cardinality(None, None, None);
740        assert_eq!(card, 20);
741    }
742
743    #[test]
744    fn test_dataset_estimate_pattern_cardinality_known_predicate() {
745        let mut stats = DatasetStatistics::new();
746        for i in 0..30 {
747            stats.record_triple("http://p/name", &format!("name{}", i));
748        }
749        for i in 0..10 {
750            stats.record_triple("http://p/age", &format!("{}", i));
751        }
752        let card = stats.estimate_pattern_cardinality(None, Some("http://p/name"), None);
753        assert_eq!(card, 30);
754    }
755
756    #[test]
757    fn test_dataset_estimate_pattern_cardinality_eq_filter() {
758        let mut stats = DatasetStatistics::with_num_buckets(5);
759        for i in 0..100 {
760            stats.record_triple("http://p/color", &format!("color{:03}", i));
761        }
762        let card =
763            stats.estimate_pattern_cardinality(None, Some("http://p/color"), Some("color050"));
764        assert!(card >= 1, "Equality filter should return at least 1");
765    }
766
767    #[test]
768    fn test_dataset_rebuild_histograms() {
769        let mut stats = DatasetStatistics::new();
770        let pairs: Vec<(String, String)> = (0..200)
771            .map(|i| ("http://p".to_string(), format!("v{:04}", i)))
772            .collect();
773        stats.rebuild_histograms(&pairs);
774
775        assert_eq!(stats.total_triples(), 200);
776        assert_eq!(stats.predicate_count(), 1);
777
778        let hist = stats
779            .get_histogram("http://p")
780            .expect("histogram must exist");
781        assert_eq!(hist.total_count, 200);
782    }
783
784    #[test]
785    fn test_dataset_set_distinct_subjects() {
786        let mut stats = DatasetStatistics::new();
787        stats.record_triple("p", "o");
788        stats.set_distinct_subjects(500);
789        // Estimate with subject binding should give lower cardinality.
790        let with_subject = stats.estimate_pattern_cardinality(Some("s"), Some("p"), None);
791        let without_subject = stats.estimate_pattern_cardinality(None, Some("p"), None);
792        assert!(with_subject <= without_subject);
793    }
794
795    #[test]
796    fn test_dataset_unknown_predicate_estimate() {
797        let mut stats = DatasetStatistics::new();
798        for i in 0..90 {
799            stats.record_triple("http://p/known", &format!("v{}", i));
800        }
801        // Unknown predicate — should fall back to uniform estimate.
802        let card = stats.estimate_pattern_cardinality(None, Some("http://p/unknown"), None);
803        assert!(card >= 1);
804    }
805}
oxirs_core/statistics/histogram.rs

oxirs_core/statistics/
histogram.rs