1use super::histogram::Histogram;
8use graphos_common::types::Value;
9use std::collections::HashMap;
10
11#[derive(Debug, Clone, Default)]
13pub struct RdfStatistics {
14 pub total_triples: u64,
16 pub subject_count: u64,
18 pub predicate_count: u64,
20 pub object_count: u64,
22
23 pub predicates: HashMap<String, PredicateStatistics>,
25
26 pub subject_histogram: Option<Histogram>,
28 pub object_histogram: Option<Histogram>,
30
31 pub index_stats: IndexStatistics,
33}
34
35impl RdfStatistics {
36 pub fn new() -> Self {
38 Self::default()
39 }
40
41 pub fn update_from_counts(
43 &mut self,
44 total_triples: u64,
45 subject_count: u64,
46 predicate_count: u64,
47 object_count: u64,
48 ) {
49 self.total_triples = total_triples;
50 self.subject_count = subject_count;
51 self.predicate_count = predicate_count;
52 self.object_count = object_count;
53 }
54
55 pub fn update_predicate(&mut self, predicate: &str, stats: PredicateStatistics) {
57 self.predicates.insert(predicate.to_string(), stats);
58 }
59
60 pub fn get_predicate(&self, predicate: &str) -> Option<&PredicateStatistics> {
62 self.predicates.get(predicate)
63 }
64
65 pub fn estimate_triple_pattern_cardinality(
72 &self,
73 subject_bound: bool,
74 predicate_bound: Option<&str>,
75 object_bound: bool,
76 ) -> f64 {
77 if self.total_triples == 0 {
78 return 0.0;
79 }
80
81 let base = self.total_triples as f64;
82
83 match (subject_bound, predicate_bound, object_bound) {
84 (true, Some(_), true) => 1.0,
86
87 (true, Some(pred), false) => {
89 if let Some(stats) = self.predicates.get(pred) {
90 stats.avg_objects_per_subject()
92 } else {
93 10.0
95 }
96 }
97
98 (true, None, true) => {
100 self.predicate_count as f64
102 }
103
104 (true, None, false) => {
106 base / self.subject_count.max(1) as f64
108 }
109
110 (false, Some(pred), false) => {
112 if let Some(stats) = self.predicates.get(pred) {
113 stats.triple_count as f64
114 } else {
115 base / self.predicate_count.max(1) as f64
116 }
117 }
118
119 (false, Some(pred), true) => {
121 if let Some(stats) = self.predicates.get(pred) {
122 stats.avg_subjects_per_object()
124 } else {
125 10.0
126 }
127 }
128
129 (false, None, true) => {
131 base / self.object_count.max(1) as f64
133 }
134
135 (false, None, false) => base,
137 }
138 }
139
140 pub fn estimate_join_selectivity(
142 &self,
143 var_position1: TriplePosition,
144 var_position2: TriplePosition,
145 ) -> f64 {
146 let domain_size = match (var_position1, var_position2) {
147 (TriplePosition::Subject, TriplePosition::Subject) => self.subject_count,
148 (TriplePosition::Subject, TriplePosition::Object)
149 | (TriplePosition::Object, TriplePosition::Subject) => {
150 self.subject_count.max(self.object_count)
152 }
153 (TriplePosition::Object, TriplePosition::Object) => self.object_count,
154 _ => {
155 self.predicate_count
157 }
158 };
159
160 if domain_size == 0 {
161 return 1.0;
162 }
163
164 1.0 / domain_size as f64
166 }
167
168 pub fn estimate_filter_selectivity(&self, predicate_iri: Option<&str>) -> f64 {
170 if let Some(pred) = predicate_iri {
172 if let Some(stats) = self.predicates.get(pred) {
173 if let Some(ref _hist) = stats.object_histogram {
175 return 0.33;
177 }
178 }
179 }
180 0.33 }
182}
183
184#[derive(Debug, Clone, Copy, PartialEq, Eq)]
186pub enum TriplePosition {
187 Subject,
189 Predicate,
191 Object,
193}
194
195#[derive(Debug, Clone)]
197pub struct PredicateStatistics {
198 pub triple_count: u64,
200 pub distinct_subjects: u64,
202 pub distinct_objects: u64,
204 pub is_functional: bool,
206 pub is_inverse_functional: bool,
208 pub object_type_distribution: HashMap<String, u64>,
210 pub object_histogram: Option<Histogram>,
212}
213
214impl PredicateStatistics {
215 pub fn new(triple_count: u64, distinct_subjects: u64, distinct_objects: u64) -> Self {
217 let is_functional = triple_count > 0 && triple_count == distinct_subjects;
218 let is_inverse_functional = triple_count > 0 && triple_count == distinct_objects;
219
220 Self {
221 triple_count,
222 distinct_subjects,
223 distinct_objects,
224 is_functional,
225 is_inverse_functional,
226 object_type_distribution: HashMap::new(),
227 object_histogram: None,
228 }
229 }
230
231 pub fn with_functional(mut self, functional: bool) -> Self {
233 self.is_functional = functional;
234 self
235 }
236
237 pub fn with_object_histogram(mut self, histogram: Histogram) -> Self {
239 self.object_histogram = Some(histogram);
240 self
241 }
242
243 pub fn with_object_types(mut self, types: HashMap<String, u64>) -> Self {
245 self.object_type_distribution = types;
246 self
247 }
248
249 pub fn avg_objects_per_subject(&self) -> f64 {
251 if self.distinct_subjects == 0 {
252 return 0.0;
253 }
254 self.triple_count as f64 / self.distinct_subjects as f64
255 }
256
257 pub fn avg_subjects_per_object(&self) -> f64 {
259 if self.distinct_objects == 0 {
260 return 0.0;
261 }
262 self.triple_count as f64 / self.distinct_objects as f64
263 }
264
265 pub fn object_equality_selectivity(&self, _value: &Value) -> f64 {
267 if let Some(ref hist) = self.object_histogram {
268 return hist.estimate_equality_selectivity(_value);
270 }
271
272 if self.distinct_objects == 0 {
274 return 1.0;
275 }
276 1.0 / self.distinct_objects as f64
277 }
278}
279
280#[derive(Debug, Clone, Default)]
282pub struct IndexStatistics {
283 pub spo_lookup_cost: f64,
285 pub pos_lookup_cost: f64,
287 pub osp_lookup_cost: f64,
289 pub has_osp_index: bool,
291}
292
293impl IndexStatistics {
294 pub fn new() -> Self {
296 Self {
297 spo_lookup_cost: 1.0,
298 pos_lookup_cost: 1.5,
299 osp_lookup_cost: 2.0,
300 has_osp_index: true,
301 }
302 }
303
304 pub fn estimate_pattern_cost(
306 &self,
307 subject_bound: bool,
308 predicate_bound: bool,
309 object_bound: bool,
310 ) -> f64 {
311 match (subject_bound, predicate_bound, object_bound) {
312 (true, _, _) => self.spo_lookup_cost,
314 (false, true, _) => self.pos_lookup_cost,
316 (false, false, true) if self.has_osp_index => self.osp_lookup_cost,
318 _ => 10.0, }
321 }
322}
323
324#[derive(Default)]
326pub struct RdfStatisticsCollector {
327 triple_count: u64,
329 subjects: HashMap<String, u64>,
331 predicates: HashMap<String, PredicateCollector>,
333 objects: HashMap<String, u64>,
335}
336
337#[derive(Default)]
339struct PredicateCollector {
340 count: u64,
341 subjects: HashMap<String, u64>,
342 objects: HashMap<String, u64>,
343}
344
345impl RdfStatisticsCollector {
346 pub fn new() -> Self {
348 Self::default()
349 }
350
351 pub fn record_triple(&mut self, subject: &str, predicate: &str, object: &str) {
353 self.triple_count += 1;
354
355 *self.subjects.entry(subject.to_string()).or_insert(0) += 1;
356 *self.objects.entry(object.to_string()).or_insert(0) += 1;
357
358 let pred_stats = self.predicates.entry(predicate.to_string()).or_default();
359 pred_stats.count += 1;
360 *pred_stats.subjects.entry(subject.to_string()).or_insert(0) += 1;
361 *pred_stats.objects.entry(object.to_string()).or_insert(0) += 1;
362 }
363
364 pub fn build(self) -> RdfStatistics {
366 let mut stats = RdfStatistics::new();
367
368 stats.total_triples = self.triple_count;
369 stats.subject_count = self.subjects.len() as u64;
370 stats.predicate_count = self.predicates.len() as u64;
371 stats.object_count = self.objects.len() as u64;
372
373 for (pred, collector) in self.predicates {
374 let pred_stats = PredicateStatistics::new(
375 collector.count,
376 collector.subjects.len() as u64,
377 collector.objects.len() as u64,
378 );
379 stats.predicates.insert(pred, pred_stats);
380 }
381
382 stats.index_stats = IndexStatistics::new();
383
384 stats
385 }
386}
387
388#[cfg(test)]
389mod tests {
390 use super::*;
391
392 #[test]
393 fn test_rdf_statistics_basic() {
394 let mut stats = RdfStatistics::new();
395 stats.update_from_counts(1000, 100, 50, 200);
396
397 let card = stats.estimate_triple_pattern_cardinality(true, None, false);
399 assert!(card > 0.0);
400
401 let full_card = stats.estimate_triple_pattern_cardinality(false, None, false);
403 assert_eq!(full_card, 1000.0);
404 }
405
406 #[test]
407 fn test_predicate_statistics() {
408 let pred_stats = PredicateStatistics::new(100, 50, 80);
409
410 assert_eq!(pred_stats.avg_objects_per_subject(), 2.0);
411 assert!(!pred_stats.is_functional);
412 }
413
414 #[test]
415 fn test_functional_predicate() {
416 let pred_stats = PredicateStatistics::new(100, 100, 100);
417
418 assert!(pred_stats.is_functional);
419 assert!(pred_stats.is_inverse_functional);
420 assert_eq!(pred_stats.avg_objects_per_subject(), 1.0);
421 }
422
423 #[test]
424 fn test_join_selectivity() {
425 let mut stats = RdfStatistics::new();
426 stats.update_from_counts(1000, 100, 50, 200);
427
428 let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Subject);
429 assert_eq!(sel, 0.01); let sel = stats.estimate_join_selectivity(TriplePosition::Subject, TriplePosition::Object);
432 assert_eq!(sel, 1.0 / 200.0); }
434
435 #[test]
436 fn test_statistics_collector() {
437 let mut collector = RdfStatisticsCollector::new();
438
439 collector.record_triple("alice", "knows", "bob");
440 collector.record_triple("alice", "name", "Alice");
441 collector.record_triple("bob", "name", "Bob");
442 collector.record_triple("bob", "knows", "charlie");
443
444 let stats = collector.build();
445
446 assert_eq!(stats.total_triples, 4);
447 assert_eq!(stats.subject_count, 2); assert_eq!(stats.predicate_count, 2); assert_eq!(stats.object_count, 4); }
451
452 #[test]
453 fn test_pattern_cost_estimation() {
454 let index_stats = IndexStatistics::new();
455
456 let cost = index_stats.estimate_pattern_cost(true, false, false);
458 assert_eq!(cost, 1.0);
459
460 let cost = index_stats.estimate_pattern_cost(false, true, false);
462 assert_eq!(cost, 1.5);
463
464 let cost = index_stats.estimate_pattern_cost(false, false, false);
466 assert_eq!(cost, 10.0);
467 }
468}