1use super::histogram::Histogram;
9use grafeo_common::types::Value;
10use std::collections::HashMap;
11
12#[derive(Debug, Clone, Default)]
17pub struct RdfStatistics {
18 pub total_triples: u64,
20 pub subject_count: u64,
22 pub predicate_count: u64,
24 pub object_count: u64,
26
27 pub predicates: HashMap<String, PredicateStatistics>,
29
30 pub subject_histogram: Option<Histogram>,
32 pub object_histogram: Option<Histogram>,
34
35 pub index_stats: IndexStatistics,
37}
38
39impl RdfStatistics {
40 pub fn new() -> Self {
42 Self::default()
43 }
44
45 pub fn update_from_counts(
47 &mut self,
48 total_triples: u64,
49 subject_count: u64,
50 predicate_count: u64,
51 object_count: u64,
52 ) {
53 self.total_triples = total_triples;
54 self.subject_count = subject_count;
55 self.predicate_count = predicate_count;
56 self.object_count = object_count;
57 }
58
59 pub fn update_predicate(&mut self, predicate: &str, stats: PredicateStatistics) {
61 self.predicates.insert(predicate.to_string(), stats);
62 }
63
64 pub fn get_predicate(&self, predicate: &str) -> Option<&PredicateStatistics> {
66 self.predicates.get(predicate)
67 }
68
69 pub fn estimate_triple_pattern_cardinality(
76 &self,
77 subject_bound: bool,
78 predicate_bound: Option<&str>,
79 object_bound: bool,
80 ) -> f64 {
81 if self.total_triples == 0 {
82 return 0.0;
83 }
84
85 let base = self.total_triples as f64;
86
87 match (subject_bound, predicate_bound, object_bound) {
88 (true, Some(_), true) => 1.0,
90
91 (true, Some(pred), false) => {
93 if let Some(stats) = self.predicates.get(pred) {
94 stats.avg_objects_per_subject()
96 } else {
97 10.0
99 }
100 }
101
102 (true, None, true) => {
104 self.predicate_count as f64
106 }
107
108 (true, None, false) => {
110 base / self.subject_count.max(1) as f64
112 }
113
114 (false, Some(pred), false) => {
116 if let Some(stats) = self.predicates.get(pred) {
117 stats.triple_count as f64
118 } else {
119 base / self.predicate_count.max(1) as f64
120 }
121 }
122
123 (false, Some(pred), true) => {
125 if let Some(stats) = self.predicates.get(pred) {
126 stats.avg_subjects_per_object()
128 } else {
129 10.0
130 }
131 }
132
133 (false, None, true) => {
135 base / self.object_count.max(1) as f64
137 }
138
139 (false, None, false) => base,
141 }
142 }
143
144 pub fn estimate_join_selectivity(
146 &self,
147 var_position1: TriplePosition,
148 var_position2: TriplePosition,
149 ) -> f64 {
150 let domain_size = match (var_position1, var_position2) {
151 (TriplePosition::Subject, TriplePosition::Subject) => self.subject_count,
152 (TriplePosition::Subject, TriplePosition::Object)
153 | (TriplePosition::Object, TriplePosition::Subject) => {
154 self.subject_count.max(self.object_count)
156 }
157 (TriplePosition::Object, TriplePosition::Object) => self.object_count,
158 _ => {
159 self.predicate_count
161 }
162 };
163
164 if domain_size == 0 {
165 return 1.0;
166 }
167
168 1.0 / domain_size as f64
170 }
171
172 pub fn estimate_filter_selectivity(&self, predicate_iri: Option<&str>) -> f64 {
174 if let Some(pred) = predicate_iri
176 && let Some(stats) = self.predicates.get(pred)
177 {
178 if let Some(ref _hist) = stats.object_histogram {
180 return 0.33;
182 }
183 }
184 0.33 }
186}
187
/// Identifies one of the three term positions of a triple.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum TriplePosition {
    /// The subject position.
    Subject,
    /// The predicate position.
    Predicate,
    /// The object position.
    Object,
}
199
200#[derive(Debug, Clone)]
202pub struct PredicateStatistics {
203 pub triple_count: u64,
205 pub distinct_subjects: u64,
207 pub distinct_objects: u64,
209 pub is_functional: bool,
211 pub is_inverse_functional: bool,
213 pub object_type_distribution: HashMap<String, u64>,
215 pub object_histogram: Option<Histogram>,
217}
218
219impl PredicateStatistics {
220 pub fn new(triple_count: u64, distinct_subjects: u64, distinct_objects: u64) -> Self {
222 let is_functional = triple_count > 0 && triple_count == distinct_subjects;
223 let is_inverse_functional = triple_count > 0 && triple_count == distinct_objects;
224
225 Self {
226 triple_count,
227 distinct_subjects,
228 distinct_objects,
229 is_functional,
230 is_inverse_functional,
231 object_type_distribution: HashMap::new(),
232 object_histogram: None,
233 }
234 }
235
236 pub fn with_functional(mut self, functional: bool) -> Self {
238 self.is_functional = functional;
239 self
240 }
241
242 pub fn with_object_histogram(mut self, histogram: Histogram) -> Self {
244 self.object_histogram = Some(histogram);
245 self
246 }
247
248 pub fn with_object_types(mut self, types: HashMap<String, u64>) -> Self {
250 self.object_type_distribution = types;
251 self
252 }
253
254 pub fn avg_objects_per_subject(&self) -> f64 {
256 if self.distinct_subjects == 0 {
257 return 0.0;
258 }
259 self.triple_count as f64 / self.distinct_subjects as f64
260 }
261
262 pub fn avg_subjects_per_object(&self) -> f64 {
264 if self.distinct_objects == 0 {
265 return 0.0;
266 }
267 self.triple_count as f64 / self.distinct_objects as f64
268 }
269
270 pub fn object_equality_selectivity(&self, _value: &Value) -> f64 {
272 if let Some(ref hist) = self.object_histogram {
273 return hist.estimate_equality_selectivity(_value);
275 }
276
277 if self.distinct_objects == 0 {
279 return 1.0;
280 }
281 1.0 / self.distinct_objects as f64
282 }
283}
284
/// Cost parameters for triple index lookups.
///
/// `Default` produces the same values as [`IndexStatistics::new`]. A derived
/// `Default` would zero every cost, which makes any container built through
/// `Default` (for example `RdfStatistics::new()`) report free lookups.
#[derive(Debug, Clone)]
pub struct IndexStatistics {
    /// Lookup cost charged when two or three pattern positions are bound.
    /// The name suggests the subject-predicate-object index (assumption).
    pub spo_lookup_cost: f64,
    /// Lookup cost charged when exactly one pattern position is bound.
    /// The name suggests the predicate-object-subject index (assumption).
    pub pos_lookup_cost: f64,
    /// Lookup cost presumably tied to an object-subject-predicate index; not
    /// consulted by `estimate_pattern_cost`.
    pub osp_lookup_cost: f64,
    /// Flag presumably recording whether an object-subject-predicate index
    /// exists; not consulted by `estimate_pattern_cost`.
    pub has_osp_index: bool,
}

impl Default for IndexStatistics {
    /// Returns the same parameters as [`IndexStatistics::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl IndexStatistics {
    /// Creates the standard cost parameters: 1.0 / 1.5 / 2.0 for the three
    /// lookup costs, with `has_osp_index` set to `true`.
    pub fn new() -> Self {
        Self {
            spo_lookup_cost: 1.0,
            pos_lookup_cost: 1.5,
            osp_lookup_cost: 2.0,
            has_osp_index: true,
        }
    }

    /// Estimates the cost of evaluating a triple pattern.
    ///
    /// Only the number of bound positions matters, not which ones:
    /// - two or three bound: `spo_lookup_cost`
    /// - exactly one bound: `pos_lookup_cost`
    /// - none bound: a fixed cost of `10.0`
    pub fn estimate_pattern_cost(
        &self,
        subject_bound: bool,
        predicate_bound: bool,
        object_bound: bool,
    ) -> f64 {
        // Fixed cost charged when no position is bound.
        const FULL_SCAN_COST: f64 = 10.0;

        let bound_count = [subject_bound, predicate_bound, object_bound]
            .iter()
            .filter(|&&bound| bound)
            .count();

        match bound_count {
            0 => FULL_SCAN_COST,
            1 => self.pos_lookup_cost,
            _ => self.spo_lookup_cost,
        }
    }
}
335
336#[derive(Default)]
341pub struct RdfStatisticsCollector {
342 triple_count: u64,
344 subjects: HashMap<String, u64>,
346 predicates: HashMap<String, PredicateCollector>,
348 objects: HashMap<String, u64>,
350}
351
/// Running counts for a single predicate while triples are being recorded.
#[derive(Default)]
struct PredicateCollector {
    /// Number of triples seen with this predicate.
    count: u64,
    /// Occurrence count per subject string seen with this predicate.
    subjects: HashMap<String, u64>,
    /// Occurrence count per object string seen with this predicate.
    objects: HashMap<String, u64>,
}
359
360impl RdfStatisticsCollector {
361 pub fn new() -> Self {
363 Self::default()
364 }
365
366 pub fn record_triple(&mut self, subject: &str, predicate: &str, object: &str) {
368 self.triple_count += 1;
369
370 *self.subjects.entry(subject.to_string()).or_insert(0) += 1;
371 *self.objects.entry(object.to_string()).or_insert(0) += 1;
372
373 let pred_stats = self.predicates.entry(predicate.to_string()).or_default();
374 pred_stats.count += 1;
375 *pred_stats.subjects.entry(subject.to_string()).or_insert(0) += 1;
376 *pred_stats.objects.entry(object.to_string()).or_insert(0) += 1;
377 }
378
379 pub fn build(self) -> RdfStatistics {
381 let mut stats = RdfStatistics::new();
382
383 stats.total_triples = self.triple_count;
384 stats.subject_count = self.subjects.len() as u64;
385 stats.predicate_count = self.predicates.len() as u64;
386 stats.object_count = self.objects.len() as u64;
387
388 for (pred, collector) in self.predicates {
389 let pred_stats = PredicateStatistics::new(
390 collector.count,
391 collector.subjects.len() as u64,
392 collector.objects.len() as u64,
393 );
394 stats.predicates.insert(pred, pred_stats);
395 }
396
397 stats.index_stats = IndexStatistics::new();
398
399 stats
400 }
401}
402
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: 1000 triples, 100 subjects, 50 predicates, 200 objects.
    fn store_stats() -> RdfStatistics {
        let mut stats = RdfStatistics::new();
        stats.update_from_counts(1000, 100, 50, 200);
        stats
    }

    #[test]
    fn test_rdf_statistics_basic() {
        let stats = store_stats();

        // Only the subject is bound: some positive estimate is expected.
        let subject_only = stats.estimate_triple_pattern_cardinality(true, None, false);
        assert!(subject_only > 0.0);

        // Nothing bound: every triple matches.
        let unbound = stats.estimate_triple_pattern_cardinality(false, None, false);
        assert_eq!(unbound, 1000.0);
    }

    #[test]
    fn test_predicate_statistics() {
        // 100 triples over 50 subjects: two per subject, hence not functional.
        let knows = PredicateStatistics::new(100, 50, 80);

        assert_eq!(knows.avg_objects_per_subject(), 2.0);
        assert!(!knows.is_functional);
    }

    #[test]
    fn test_functional_predicate() {
        // Triple count equals both distinct counts.
        let one_to_one = PredicateStatistics::new(100, 100, 100);

        assert!(one_to_one.is_functional);
        assert!(one_to_one.is_inverse_functional);
        assert_eq!(one_to_one.avg_objects_per_subject(), 1.0);
    }

    #[test]
    fn test_join_selectivity() {
        let stats = store_stats();

        // Subject-subject join: 1 / 100 distinct subjects.
        assert_eq!(
            stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Subject),
            0.01
        );

        // Subject-object join: 1 / max(100 subjects, 200 objects).
        assert_eq!(
            stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Object),
            1.0 / 200.0
        );
    }

    #[test]
    fn test_statistics_collector() {
        let mut collector = RdfStatisticsCollector::new();
        collector.record_triple("alix", "knows", "gus");
        collector.record_triple("alix", "name", "Alix");
        collector.record_triple("gus", "name", "Gus");
        collector.record_triple("gus", "knows", "vincent");

        let built = collector.build();

        assert_eq!(built.total_triples, 4);
        // Subjects: alix, gus.
        assert_eq!(built.subject_count, 2);
        // Predicates: knows, name.
        assert_eq!(built.predicate_count, 2);
        // Objects: gus, Alix, Gus, vincent.
        assert_eq!(built.object_count, 4);
    }

    #[test]
    fn test_pattern_cost_estimation() {
        let costs = IndexStatistics::new();

        // Two or three bound positions use the cheapest lookup.
        assert_eq!(costs.estimate_pattern_cost(true, true, false), 1.0);
        assert_eq!(costs.estimate_pattern_cost(true, true, true), 1.0);

        // Exactly one bound position.
        assert_eq!(costs.estimate_pattern_cost(true, false, false), 1.5);
        assert_eq!(costs.estimate_pattern_cost(false, true, false), 1.5);

        // Nothing bound: fixed scan cost.
        assert_eq!(costs.estimate_pattern_cost(false, false, false), 10.0);
    }
}