1use super::histogram::Histogram;
9use grafeo_common::types::Value;
10use std::collections::HashMap;
11
12#[derive(Debug, Clone, Default)]
17pub struct RdfStatistics {
18 pub total_triples: u64,
20 pub subject_count: u64,
22 pub predicate_count: u64,
24 pub object_count: u64,
26
27 pub predicates: HashMap<String, PredicateStatistics>,
29
30 pub subject_histogram: Option<Histogram>,
32 pub object_histogram: Option<Histogram>,
34
35 pub index_stats: IndexStatistics,
37}
38
39impl RdfStatistics {
40 pub fn new() -> Self {
42 Self::default()
43 }
44
45 pub fn update_from_counts(
47 &mut self,
48 total_triples: u64,
49 subject_count: u64,
50 predicate_count: u64,
51 object_count: u64,
52 ) {
53 self.total_triples = total_triples;
54 self.subject_count = subject_count;
55 self.predicate_count = predicate_count;
56 self.object_count = object_count;
57 }
58
59 pub fn update_predicate(&mut self, predicate: &str, stats: PredicateStatistics) {
61 self.predicates.insert(predicate.to_string(), stats);
62 }
63
64 pub fn get_predicate(&self, predicate: &str) -> Option<&PredicateStatistics> {
66 self.predicates.get(predicate)
67 }
68
69 pub fn estimate_triple_pattern_cardinality(
76 &self,
77 subject_bound: bool,
78 predicate_bound: Option<&str>,
79 object_bound: bool,
80 ) -> f64 {
81 if self.total_triples == 0 {
82 return 0.0;
83 }
84
85 let base = self.total_triples as f64;
86
87 match (subject_bound, predicate_bound, object_bound) {
88 (true, Some(_), true) => 1.0,
90
91 (true, Some(pred), false) => {
93 if let Some(stats) = self.predicates.get(pred) {
94 stats.avg_objects_per_subject()
96 } else {
97 10.0
99 }
100 }
101
102 (true, None, true) => {
104 self.predicate_count as f64
106 }
107
108 (true, None, false) => {
110 base / self.subject_count.max(1) as f64
112 }
113
114 (false, Some(pred), false) => {
116 if let Some(stats) = self.predicates.get(pred) {
117 stats.triple_count as f64
118 } else {
119 base / self.predicate_count.max(1) as f64
120 }
121 }
122
123 (false, Some(pred), true) => {
125 if let Some(stats) = self.predicates.get(pred) {
126 stats.avg_subjects_per_object()
128 } else {
129 10.0
130 }
131 }
132
133 (false, None, true) => {
135 base / self.object_count.max(1) as f64
137 }
138
139 (false, None, false) => base,
141 }
142 }
143
144 pub fn estimate_join_selectivity(
146 &self,
147 var_position1: TriplePosition,
148 var_position2: TriplePosition,
149 ) -> f64 {
150 let domain_size = match (var_position1, var_position2) {
151 (TriplePosition::Subject, TriplePosition::Subject) => self.subject_count,
152 (TriplePosition::Subject, TriplePosition::Object)
153 | (TriplePosition::Object, TriplePosition::Subject) => {
154 self.subject_count.max(self.object_count)
156 }
157 (TriplePosition::Object, TriplePosition::Object) => self.object_count,
158 _ => {
159 self.predicate_count
161 }
162 };
163
164 if domain_size == 0 {
165 return 1.0;
166 }
167
168 1.0 / domain_size as f64
170 }
171
172 pub fn estimate_filter_selectivity(&self, predicate_iri: Option<&str>) -> f64 {
174 if let Some(pred) = predicate_iri {
176 if let Some(stats) = self.predicates.get(pred) {
177 if let Some(ref _hist) = stats.object_histogram {
179 return 0.33;
181 }
182 }
183 }
184 0.33 }
186}
187
188#[derive(Debug, Clone, Copy, PartialEq, Eq)]
190pub enum TriplePosition {
191 Subject,
193 Predicate,
195 Object,
197}
198
199#[derive(Debug, Clone)]
201pub struct PredicateStatistics {
202 pub triple_count: u64,
204 pub distinct_subjects: u64,
206 pub distinct_objects: u64,
208 pub is_functional: bool,
210 pub is_inverse_functional: bool,
212 pub object_type_distribution: HashMap<String, u64>,
214 pub object_histogram: Option<Histogram>,
216}
217
218impl PredicateStatistics {
219 pub fn new(triple_count: u64, distinct_subjects: u64, distinct_objects: u64) -> Self {
221 let is_functional = triple_count > 0 && triple_count == distinct_subjects;
222 let is_inverse_functional = triple_count > 0 && triple_count == distinct_objects;
223
224 Self {
225 triple_count,
226 distinct_subjects,
227 distinct_objects,
228 is_functional,
229 is_inverse_functional,
230 object_type_distribution: HashMap::new(),
231 object_histogram: None,
232 }
233 }
234
235 pub fn with_functional(mut self, functional: bool) -> Self {
237 self.is_functional = functional;
238 self
239 }
240
241 pub fn with_object_histogram(mut self, histogram: Histogram) -> Self {
243 self.object_histogram = Some(histogram);
244 self
245 }
246
247 pub fn with_object_types(mut self, types: HashMap<String, u64>) -> Self {
249 self.object_type_distribution = types;
250 self
251 }
252
253 pub fn avg_objects_per_subject(&self) -> f64 {
255 if self.distinct_subjects == 0 {
256 return 0.0;
257 }
258 self.triple_count as f64 / self.distinct_subjects as f64
259 }
260
261 pub fn avg_subjects_per_object(&self) -> f64 {
263 if self.distinct_objects == 0 {
264 return 0.0;
265 }
266 self.triple_count as f64 / self.distinct_objects as f64
267 }
268
269 pub fn object_equality_selectivity(&self, _value: &Value) -> f64 {
271 if let Some(ref hist) = self.object_histogram {
272 return hist.estimate_equality_selectivity(_value);
274 }
275
276 if self.distinct_objects == 0 {
278 return 1.0;
279 }
280 1.0 / self.distinct_objects as f64
281 }
282}
283
284#[derive(Debug, Clone, Default)]
289pub struct IndexStatistics {
290 pub spo_lookup_cost: f64,
292 pub pos_lookup_cost: f64,
294 pub osp_lookup_cost: f64,
296 pub has_osp_index: bool,
298}
299
300impl IndexStatistics {
301 pub fn new() -> Self {
303 Self {
304 spo_lookup_cost: 1.0,
305 pos_lookup_cost: 1.5,
306 osp_lookup_cost: 2.0,
307 has_osp_index: true,
308 }
309 }
310
311 pub fn estimate_pattern_cost(
313 &self,
314 subject_bound: bool,
315 predicate_bound: bool,
316 object_bound: bool,
317 ) -> f64 {
318 match (subject_bound, predicate_bound, object_bound) {
319 (true, _, _) => self.spo_lookup_cost,
321 (false, true, _) => self.pos_lookup_cost,
323 (false, false, true) if self.has_osp_index => self.osp_lookup_cost,
325 _ => 10.0, }
328 }
329}
330
331#[derive(Default)]
336pub struct RdfStatisticsCollector {
337 triple_count: u64,
339 subjects: HashMap<String, u64>,
341 predicates: HashMap<String, PredicateCollector>,
343 objects: HashMap<String, u64>,
345}
346
347#[derive(Default)]
349struct PredicateCollector {
350 count: u64,
351 subjects: HashMap<String, u64>,
352 objects: HashMap<String, u64>,
353}
354
355impl RdfStatisticsCollector {
356 pub fn new() -> Self {
358 Self::default()
359 }
360
361 pub fn record_triple(&mut self, subject: &str, predicate: &str, object: &str) {
363 self.triple_count += 1;
364
365 *self.subjects.entry(subject.to_string()).or_insert(0) += 1;
366 *self.objects.entry(object.to_string()).or_insert(0) += 1;
367
368 let pred_stats = self.predicates.entry(predicate.to_string()).or_default();
369 pred_stats.count += 1;
370 *pred_stats.subjects.entry(subject.to_string()).or_insert(0) += 1;
371 *pred_stats.objects.entry(object.to_string()).or_insert(0) += 1;
372 }
373
374 pub fn build(self) -> RdfStatistics {
376 let mut stats = RdfStatistics::new();
377
378 stats.total_triples = self.triple_count;
379 stats.subject_count = self.subjects.len() as u64;
380 stats.predicate_count = self.predicates.len() as u64;
381 stats.object_count = self.objects.len() as u64;
382
383 for (pred, collector) in self.predicates {
384 let pred_stats = PredicateStatistics::new(
385 collector.count,
386 collector.subjects.len() as u64,
387 collector.objects.len() as u64,
388 );
389 stats.predicates.insert(pred, pred_stats);
390 }
391
392 stats.index_stats = IndexStatistics::new();
393
394 stats
395 }
396}
397
398#[cfg(test)]
399mod tests {
400 use super::*;
401
402 #[test]
403 fn test_rdf_statistics_basic() {
404 let mut stats = RdfStatistics::new();
405 stats.update_from_counts(1000, 100, 50, 200);
406
407 let card = stats.estimate_triple_pattern_cardinality(true, None, false);
409 assert!(card > 0.0);
410
411 let full_card = stats.estimate_triple_pattern_cardinality(false, None, false);
413 assert_eq!(full_card, 1000.0);
414 }
415
416 #[test]
417 fn test_predicate_statistics() {
418 let pred_stats = PredicateStatistics::new(100, 50, 80);
419
420 assert_eq!(pred_stats.avg_objects_per_subject(), 2.0);
421 assert!(!pred_stats.is_functional);
422 }
423
424 #[test]
425 fn test_functional_predicate() {
426 let pred_stats = PredicateStatistics::new(100, 100, 100);
427
428 assert!(pred_stats.is_functional);
429 assert!(pred_stats.is_inverse_functional);
430 assert_eq!(pred_stats.avg_objects_per_subject(), 1.0);
431 }
432
433 #[test]
434 fn test_join_selectivity() {
435 let mut stats = RdfStatistics::new();
436 stats.update_from_counts(1000, 100, 50, 200);
437
438 let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Subject);
439 assert_eq!(sel, 0.01); let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Object);
442 assert_eq!(sel, 1.0 / 200.0); }
444
445 #[test]
446 fn test_statistics_collector() {
447 let mut collector = RdfStatisticsCollector::new();
448
449 collector.record_triple("alice", "knows", "bob");
450 collector.record_triple("alice", "name", "Alice");
451 collector.record_triple("bob", "name", "Bob");
452 collector.record_triple("bob", "knows", "charlie");
453
454 let stats = collector.build();
455
456 assert_eq!(stats.total_triples, 4);
457 assert_eq!(stats.subject_count, 2); assert_eq!(stats.predicate_count, 2); assert_eq!(stats.object_count, 4); }
461
462 #[test]
463 fn test_pattern_cost_estimation() {
464 let index_stats = IndexStatistics::new();
465
466 let cost = index_stats.estimate_pattern_cost(true, false, false);
468 assert_eq!(cost, 1.0);
469
470 let cost = index_stats.estimate_pattern_cost(false, true, false);
472 assert_eq!(cost, 1.5);
473
474 let cost = index_stats.estimate_pattern_cost(false, false, false);
476 assert_eq!(cost, 10.0);
477 }
478}