1use std::collections::HashMap;
18use std::time::Instant;
19
20#[derive(Debug, Clone)]
24pub struct HistogramBucket {
25 pub lower_bound: String,
27 pub upper_bound: String,
29 pub count: u64,
31 pub distinct_count: u64,
33 pub null_count: u64,
35}
36
37impl HistogramBucket {
38 pub fn contains(&self, value: &str) -> bool {
40 value >= self.lower_bound.as_str() && value <= self.upper_bound.as_str()
41 }
42
43 pub fn overlap_fraction(&self, lo: &str, hi: &str) -> f64 {
46 if self.lower_bound.is_empty() && self.upper_bound.is_empty() {
48 return 1.0;
49 }
50 let lb = self.lower_bound.as_str();
51 let ub = self.upper_bound.as_str();
52
53 if hi < lb || lo > ub {
55 return 0.0;
56 }
57
58 if lo <= lb && hi >= ub {
60 return 1.0;
61 }
62
63 let to_pos = |s: &str| -> u64 {
66 let bytes = s.as_bytes();
67 let mut v: u64 = 0;
68 for (i, &b) in bytes.iter().enumerate().take(8) {
69 v |= (b as u64) << (56 - i * 8);
70 }
71 v
72 };
73
74 let pos_lb = to_pos(lb);
75 let pos_ub = to_pos(ub);
76 let pos_lo = to_pos(lo).max(pos_lb);
77 let pos_hi = to_pos(hi).min(pos_ub);
78
79 if pos_ub == pos_lb {
80 return 1.0;
81 }
82
83 let overlap = pos_hi.saturating_sub(pos_lo) as f64;
84 let total = (pos_ub - pos_lb) as f64;
85 (overlap / total).clamp(0.0, 1.0)
86 }
87}
88
89#[derive(Debug, Clone)]
93pub struct PredicateHistogram {
94 pub predicate_iri: String,
96 pub total_count: u64,
98 pub null_count: u64,
100 pub distinct_count: u64,
102 pub buckets: Vec<HistogramBucket>,
104}
105
106impl PredicateHistogram {
107 pub fn new(predicate: &str, num_buckets: usize) -> Self {
109 Self {
110 predicate_iri: predicate.to_owned(),
111 total_count: 0,
112 null_count: 0,
113 distinct_count: 0,
114 buckets: Vec::with_capacity(num_buckets),
115 }
116 }
117
118 pub fn build_from_values(predicate: &str, values: &[String], num_buckets: usize) -> Self {
127 let num_buckets = num_buckets.max(1);
128
129 let null_count = values.iter().filter(|v| v.is_empty()).count() as u64;
130
131 let mut sorted: Vec<&String> = values.iter().filter(|v| !v.is_empty()).collect();
132 sorted.sort_unstable();
133
134 let total_count = sorted.len() as u64;
135
136 let mut distinct_count = 0u64;
138 {
139 let mut prev: Option<&str> = None;
140 for v in &sorted {
141 if prev != Some(v.as_str()) {
142 distinct_count += 1;
143 prev = Some(v.as_str());
144 }
145 }
146 }
147
148 let mut buckets: Vec<HistogramBucket> = Vec::with_capacity(num_buckets);
149
150 if sorted.is_empty() {
151 return Self {
152 predicate_iri: predicate.to_owned(),
153 total_count,
154 null_count,
155 distinct_count,
156 buckets,
157 };
158 }
159
160 let target_depth = (sorted.len() as f64 / num_buckets as f64).ceil() as usize;
161 let target_depth = target_depth.max(1);
162
163 let mut cursor = 0usize;
164 while cursor < sorted.len() {
165 let end = (cursor + target_depth).min(sorted.len());
166 let slice = &sorted[cursor..end];
167
168 let lower_bound = slice[0].clone();
170 let upper_bound = slice[slice.len() - 1].clone();
171
172 let mut bucket_distinct = 0u64;
174 let mut prev: Option<&str> = None;
175 for v in slice {
176 if prev != Some(v.as_str()) {
177 bucket_distinct += 1;
178 prev = Some(v.as_str());
179 }
180 }
181
182 buckets.push(HistogramBucket {
183 lower_bound,
184 upper_bound,
185 count: slice.len() as u64,
186 distinct_count: bucket_distinct,
187 null_count: 0,
188 });
189
190 cursor = end;
191 }
192
193 Self {
194 predicate_iri: predicate.to_owned(),
195 total_count,
196 null_count,
197 distinct_count,
198 buckets,
199 }
200 }
201
202 pub fn estimate_selectivity_eq(&self, value: &str) -> f64 {
206 if self.total_count == 0 {
207 return 0.0;
208 }
209
210 for bucket in &self.buckets {
211 if bucket.contains(value) {
212 let distinct = bucket.distinct_count.max(1) as f64;
213 let bucket_sel = bucket.count as f64 / self.total_count as f64;
214 return bucket_sel / distinct;
216 }
217 }
218
219 let denominator =
225 (self.distinct_count.max(1) + 1) as f64 * (self.total_count.max(1) as f64);
226 1.0 / denominator
227 }
228
229 pub fn estimate_selectivity_range(&self, lo: &str, hi: &str) -> f64 {
233 if self.total_count == 0 {
234 return 0.0;
235 }
236
237 let mut covered_rows: f64 = 0.0;
238 for bucket in &self.buckets {
239 let frac = bucket.overlap_fraction(lo, hi);
240 if frac > 0.0 {
241 covered_rows += frac * bucket.count as f64;
242 }
243 }
244
245 (covered_rows / self.total_count as f64).clamp(0.0, 1.0)
246 }
247
248 pub fn estimate_cardinality(&self, total_triples: u64, predicate_filter: Option<&str>) -> u64 {
255 let base = if self.total_count > 0 {
256 self.total_count
257 } else {
258 total_triples
259 };
260
261 if base == 0 {
262 return 0;
263 }
264
265 match predicate_filter {
266 None => base,
267 Some(value) => {
268 let sel = self.estimate_selectivity_eq(value);
269 ((base as f64 * sel).ceil() as u64).max(1)
270 }
271 }
272 }
273
274 pub fn update_incremental(&mut self, new_values: &[String]) {
280 for value in new_values {
281 if value.is_empty() {
282 self.null_count += 1;
283 continue;
284 }
285
286 self.total_count += 1;
287
288 let idx = self
290 .buckets
291 .iter()
292 .rposition(|b| b.lower_bound.as_str() <= value.as_str())
293 .unwrap_or(0);
294
295 if let Some(bucket) = self.buckets.get_mut(idx) {
296 bucket.count += 1;
297 if value.as_str() < bucket.lower_bound.as_str() {
299 bucket.lower_bound = value.clone();
300 }
301 if value.as_str() > bucket.upper_bound.as_str() {
302 bucket.upper_bound = value.clone();
303 bucket.distinct_count += 1;
305 self.distinct_count += 1;
306 }
307 } else if !self.buckets.is_empty() {
308 let last = self
310 .buckets
311 .last_mut()
312 .expect("buckets non-empty checked above");
313 last.count += 1;
314 if value.as_str() > last.upper_bound.as_str() {
315 last.upper_bound = value.clone();
316 last.distinct_count += 1;
317 self.distinct_count += 1;
318 }
319 } else {
320 self.buckets.push(HistogramBucket {
322 lower_bound: value.clone(),
323 upper_bound: value.clone(),
324 count: 1,
325 distinct_count: 1,
326 null_count: 0,
327 });
328 self.distinct_count += 1;
329 }
330 }
331 }
332
333 pub fn num_buckets(&self) -> usize {
335 self.buckets.len()
336 }
337}
338
339pub struct DatasetStatistics {
343 histograms: HashMap<String, PredicateHistogram>,
345 total_triples: u64,
347 distinct_subjects: u64,
349 distinct_predicates: u64,
351 distinct_objects: u64,
353 last_updated: Instant,
355 num_buckets: usize,
357}
358
359impl DatasetStatistics {
360 pub fn new() -> Self {
362 Self::with_num_buckets(100)
363 }
364
365 pub fn with_num_buckets(num_buckets: usize) -> Self {
367 Self {
368 histograms: HashMap::new(),
369 total_triples: 0,
370 distinct_subjects: 0,
371 distinct_predicates: 0,
372 distinct_objects: 0,
373 last_updated: Instant::now(),
374 num_buckets: num_buckets.max(1),
375 }
376 }
377
378 pub fn record_triple(&mut self, predicate: &str, object: &str) {
382 self.total_triples += 1;
383
384 let num_buckets = self.num_buckets;
385 let histogram = self
386 .histograms
387 .entry(predicate.to_owned())
388 .or_insert_with(|| PredicateHistogram::new(predicate, num_buckets));
389
390 histogram.update_incremental(&[object.to_owned()]);
391
392 self.distinct_predicates = self.histograms.len() as u64;
393 self.last_updated = Instant::now();
394 }
395
396 pub fn get_histogram(&self, predicate: &str) -> Option<&PredicateHistogram> {
398 self.histograms.get(predicate)
399 }
400
401 pub fn estimate_pattern_cardinality(
410 &self,
411 subject: Option<&str>,
412 predicate: Option<&str>,
413 object: Option<&str>,
414 ) -> u64 {
415 if self.total_triples == 0 {
416 return 0;
417 }
418
419 let subject_selectivity = if subject.is_some() {
421 if self.distinct_subjects > 1 {
422 1.0 / self.distinct_subjects as f64
423 } else {
424 0.001
425 }
426 } else {
427 1.0
428 };
429
430 let base_cardinality: f64 = match (predicate, object) {
431 (None, None) => self.total_triples as f64,
432
433 (Some(pred), None) => {
434 self.histograms
436 .get(pred)
437 .map(|h| h.total_count as f64)
438 .unwrap_or_else(|| {
439 if self.distinct_predicates > 0 {
441 self.total_triples as f64 / self.distinct_predicates as f64
442 } else {
443 self.total_triples as f64 * 0.1
444 }
445 })
446 }
447
448 (Some(pred), Some(obj)) => {
449 match self.histograms.get(pred) {
451 Some(hist) => {
452 let sel = hist.estimate_selectivity_eq(obj);
453 (hist.total_count as f64 * sel).max(1.0)
454 }
455 None => 1.0, }
457 }
458
459 (None, Some(obj)) => {
460 if self.histograms.is_empty() {
462 return 0;
463 }
464 self.histograms
465 .values()
466 .map(|hist| {
467 let sel = hist.estimate_selectivity_eq(obj);
468 hist.total_count as f64 * sel
469 })
470 .sum::<f64>()
471 .max(1.0)
472 }
473 };
474
475 ((base_cardinality * subject_selectivity).ceil() as u64).max(1)
476 }
477
478 pub fn rebuild_histograms(&mut self, values: &[(String, String)]) {
483 self.histograms.clear();
484 self.total_triples = values.len() as u64;
485
486 let mut by_predicate: HashMap<&str, Vec<&String>> = HashMap::new();
488 for (pred, obj) in values {
489 by_predicate.entry(pred.as_str()).or_default().push(obj);
490 }
491
492 let num_buckets = self.num_buckets;
493 for (pred, objs) in &by_predicate {
494 let owned_objs: Vec<String> = objs.iter().map(|s| (*s).clone()).collect();
495 let histogram = PredicateHistogram::build_from_values(pred, &owned_objs, num_buckets);
496 self.histograms.insert(pred.to_string(), histogram);
497 }
498
499 self.distinct_predicates = self.histograms.len() as u64;
501 self.distinct_objects = self.histograms.values().map(|h| h.distinct_count).sum();
504
505 self.last_updated = Instant::now();
506 }
507
508 pub fn total_triples(&self) -> u64 {
510 self.total_triples
511 }
512
513 pub fn set_distinct_subjects(&mut self, count: u64) {
517 self.distinct_subjects = count;
518 }
519
520 pub fn set_distinct_objects(&mut self, count: u64) {
522 self.distinct_objects = count;
523 }
524
525 pub fn last_updated(&self) -> Instant {
527 self.last_updated
528 }
529
530 pub fn predicate_count(&self) -> usize {
532 self.histograms.len()
533 }
534}
535
536impl Default for DatasetStatistics {
537 fn default() -> Self {
538 Self::new()
539 }
540}
541
542#[cfg(test)]
543mod tests {
544 use super::*;
545
546 #[test]
549 fn test_bucket_contains() {
550 let bucket = HistogramBucket {
551 lower_bound: "apple".to_string(),
552 upper_bound: "mango".to_string(),
553 count: 10,
554 distinct_count: 5,
555 null_count: 0,
556 };
557 assert!(bucket.contains("apple"));
558 assert!(bucket.contains("cherry"));
559 assert!(bucket.contains("mango"));
560 assert!(!bucket.contains("aardvark")); assert!(!bucket.contains("zebra")); }
563
564 #[test]
565 fn test_bucket_overlap_fraction_full() {
566 let bucket = HistogramBucket {
567 lower_bound: "b".to_string(),
568 upper_bound: "m".to_string(),
569 count: 100,
570 distinct_count: 50,
571 null_count: 0,
572 };
573 let frac = bucket.overlap_fraction("a", "z");
575 assert!(frac > 0.99, "Expected ~1.0 but got {}", frac);
576 }
577
578 #[test]
579 fn test_bucket_overlap_fraction_none() {
580 let bucket = HistogramBucket {
581 lower_bound: "m".to_string(),
582 upper_bound: "z".to_string(),
583 count: 100,
584 distinct_count: 50,
585 null_count: 0,
586 };
587 let frac = bucket.overlap_fraction("a", "b");
589 assert_eq!(frac, 0.0);
590 }
591
592 #[test]
595 fn test_build_from_empty_values() {
596 let hist = PredicateHistogram::build_from_values("http://example.org/p", &[], 10);
597 assert_eq!(hist.total_count, 0);
598 assert_eq!(hist.buckets.len(), 0);
599 }
600
601 #[test]
602 fn test_build_from_values_single_bucket() {
603 let values: Vec<String> = (0..5).map(|i| format!("value{}", i)).collect();
604 let hist = PredicateHistogram::build_from_values("p", &values, 1);
605 assert_eq!(hist.total_count, 5);
606 assert_eq!(hist.buckets.len(), 1);
607 assert_eq!(hist.buckets[0].count, 5);
608 }
609
610 #[test]
611 fn test_build_from_values_multiple_buckets() {
612 let values: Vec<String> = (0..100).map(|i| format!("value{:03}", i)).collect();
613 let hist = PredicateHistogram::build_from_values("p", &values, 10);
614 assert_eq!(hist.total_count, 100);
615 assert_eq!(hist.distinct_count, 100);
616 assert!(hist.buckets.len() >= 9 && hist.buckets.len() <= 11);
618
619 let sum: u64 = hist.buckets.iter().map(|b| b.count).sum();
621 assert_eq!(sum, 100);
622 }
623
624 #[test]
625 fn test_build_handles_nulls() {
626 let mut values: Vec<String> = (0..10).map(|i| format!("v{}", i)).collect();
627 values.push(String::new()); values.push(String::new()); let hist = PredicateHistogram::build_from_values("p", &values, 5);
630 assert_eq!(hist.null_count, 2);
631 assert_eq!(hist.total_count, 10);
632 }
633
634 #[test]
635 fn test_estimate_selectivity_eq_within_bucket() {
636 let values: Vec<String> = vec![
637 "alpha".to_string(),
638 "beta".to_string(),
639 "gamma".to_string(),
640 "delta".to_string(),
641 "epsilon".to_string(),
642 ];
643 let hist = PredicateHistogram::build_from_values("p", &values, 2);
644 let sel = hist.estimate_selectivity_eq("beta");
646 assert!(sel > 0.0 && sel <= 1.0, "Selectivity {} out of range", sel);
647 }
648
649 #[test]
650 fn test_estimate_selectivity_eq_missing_value() {
651 let values: Vec<String> = (0..10).map(|i| format!("value{}", i)).collect();
652 let hist = PredicateHistogram::build_from_values("p", &values, 5);
653 let sel = hist.estimate_selectivity_eq("zzz_not_in_dataset");
654 assert!(
656 sel < 0.01,
657 "Selectivity for missing value too high: {}",
658 sel
659 );
660 }
661
662 #[test]
663 fn test_estimate_selectivity_range() {
664 let values: Vec<String> = (0..100).map(|i| format!("{:03}", i)).collect();
665 let hist = PredicateHistogram::build_from_values("p", &values, 10);
666 let sel = hist.estimate_selectivity_range("000", "049");
668 assert!(
669 sel > 0.3 && sel < 0.7,
670 "Range selectivity {} unexpected",
671 sel
672 );
673 }
674
675 #[test]
676 fn test_estimate_cardinality_no_filter() {
677 let values: Vec<String> = (0..50).map(|i| format!("v{}", i)).collect();
678 let hist = PredicateHistogram::build_from_values("p", &values, 5);
679 let card = hist.estimate_cardinality(1000, None);
680 assert_eq!(card, 50);
681 }
682
683 #[test]
684 fn test_estimate_cardinality_with_filter() {
685 let values: Vec<String> = (0..100).map(|i| format!("{:03}", i)).collect();
686 let hist = PredicateHistogram::build_from_values("p", &values, 10);
687 let card = hist.estimate_cardinality(1000, Some("050"));
688 assert!(card >= 1);
690 }
691
692 #[test]
693 fn test_update_incremental() {
694 let initial: Vec<String> = (0..10).map(|i| format!("v{}", i)).collect();
695 let mut hist = PredicateHistogram::build_from_values("p", &initial, 3);
696 let before_count = hist.total_count;
697
698 let new_vals: Vec<String> = vec!["newA".to_string(), "newB".to_string()];
699 hist.update_incremental(&new_vals);
700
701 assert_eq!(hist.total_count, before_count + 2);
702 }
703
704 #[test]
705 fn test_update_incremental_null() {
706 let initial: Vec<String> = (0..5).map(|i| format!("v{}", i)).collect();
707 let mut hist = PredicateHistogram::build_from_values("p", &initial, 2);
708 hist.update_incremental(&[String::new()]);
709 assert_eq!(hist.null_count, 1);
710 }
711
712 #[test]
715 fn test_dataset_statistics_new() {
716 let stats = DatasetStatistics::new();
717 assert_eq!(stats.total_triples(), 0);
718 assert_eq!(stats.predicate_count(), 0);
719 }
720
721 #[test]
722 fn test_dataset_record_triple() {
723 let mut stats = DatasetStatistics::new();
724 stats.record_triple("http://example.org/age", "42");
725 stats.record_triple("http://example.org/age", "25");
726 stats.record_triple("http://example.org/name", "Alice");
727
728 assert_eq!(stats.total_triples(), 3);
729 assert_eq!(stats.predicate_count(), 2);
730 assert!(stats.get_histogram("http://example.org/age").is_some());
731 }
732
733 #[test]
734 fn test_dataset_estimate_pattern_cardinality_no_bindings() {
735 let mut stats = DatasetStatistics::new();
736 for i in 0..20 {
737 stats.record_triple("http://p", &format!("obj{}", i));
738 }
739 let card = stats.estimate_pattern_cardinality(None, None, None);
740 assert_eq!(card, 20);
741 }
742
743 #[test]
744 fn test_dataset_estimate_pattern_cardinality_known_predicate() {
745 let mut stats = DatasetStatistics::new();
746 for i in 0..30 {
747 stats.record_triple("http://p/name", &format!("name{}", i));
748 }
749 for i in 0..10 {
750 stats.record_triple("http://p/age", &format!("{}", i));
751 }
752 let card = stats.estimate_pattern_cardinality(None, Some("http://p/name"), None);
753 assert_eq!(card, 30);
754 }
755
756 #[test]
757 fn test_dataset_estimate_pattern_cardinality_eq_filter() {
758 let mut stats = DatasetStatistics::with_num_buckets(5);
759 for i in 0..100 {
760 stats.record_triple("http://p/color", &format!("color{:03}", i));
761 }
762 let card =
763 stats.estimate_pattern_cardinality(None, Some("http://p/color"), Some("color050"));
764 assert!(card >= 1, "Equality filter should return at least 1");
765 }
766
767 #[test]
768 fn test_dataset_rebuild_histograms() {
769 let mut stats = DatasetStatistics::new();
770 let pairs: Vec<(String, String)> = (0..200)
771 .map(|i| ("http://p".to_string(), format!("v{:04}", i)))
772 .collect();
773 stats.rebuild_histograms(&pairs);
774
775 assert_eq!(stats.total_triples(), 200);
776 assert_eq!(stats.predicate_count(), 1);
777
778 let hist = stats
779 .get_histogram("http://p")
780 .expect("histogram must exist");
781 assert_eq!(hist.total_count, 200);
782 }
783
784 #[test]
785 fn test_dataset_set_distinct_subjects() {
786 let mut stats = DatasetStatistics::new();
787 stats.record_triple("p", "o");
788 stats.set_distinct_subjects(500);
789 let with_subject = stats.estimate_pattern_cardinality(Some("s"), Some("p"), None);
791 let without_subject = stats.estimate_pattern_cardinality(None, Some("p"), None);
792 assert!(with_subject <= without_subject);
793 }
794
795 #[test]
796 fn test_dataset_unknown_predicate_estimate() {
797 let mut stats = DatasetStatistics::new();
798 for i in 0..90 {
799 stats.record_triple("http://p/known", &format!("v{}", i));
800 }
801 let card = stats.estimate_pattern_cardinality(None, Some("http://p/unknown"), None);
803 assert!(card >= 1);
804 }
805}