1use super::histogram::Histogram;
8use grafeo_common::types::Value;
9use std::collections::HashMap;
10
11pub type PropertyKey = String;
13
14#[derive(Debug, Clone, Default)]
19pub struct Statistics {
20 pub labels: HashMap<String, LabelStatistics>,
22 pub edge_types: HashMap<String, EdgeTypeStatistics>,
24 pub properties: HashMap<PropertyKey, ColumnStatistics>,
26 pub total_nodes: u64,
28 pub total_edges: u64,
30}
31
32impl Statistics {
33 pub fn new() -> Self {
35 Self::default()
36 }
37
38 pub fn update_label(&mut self, label: &str, stats: LabelStatistics) {
40 self.labels.insert(label.to_string(), stats);
41 }
42
43 pub fn update_edge_type(&mut self, edge_type: &str, stats: EdgeTypeStatistics) {
45 self.edge_types.insert(edge_type.to_string(), stats);
46 }
47
48 pub fn update_property(&mut self, property: &str, stats: ColumnStatistics) {
50 self.properties.insert(property.to_string(), stats);
51 }
52
53 pub fn get_label(&self, label: &str) -> Option<&LabelStatistics> {
55 self.labels.get(label)
56 }
57
58 pub fn get_edge_type(&self, edge_type: &str) -> Option<&EdgeTypeStatistics> {
60 self.edge_types.get(edge_type)
61 }
62
63 pub fn get_property(&self, property: &str) -> Option<&ColumnStatistics> {
65 self.properties.get(property)
66 }
67
68 pub fn estimate_label_cardinality(&self, label: &str) -> f64 {
70 self.labels
71 .get(label)
72 .map(|s| s.node_count as f64)
73 .unwrap_or(1000.0) }
75
76 pub fn estimate_avg_degree(&self, edge_type: &str, outgoing: bool) -> f64 {
78 self.edge_types
79 .get(edge_type)
80 .map(|s| {
81 if outgoing {
82 s.avg_out_degree
83 } else {
84 s.avg_in_degree
85 }
86 })
87 .unwrap_or(10.0) }
89
90 pub fn estimate_equality_selectivity(&self, property: &str, _value: &Value) -> f64 {
92 self.properties
93 .get(property)
94 .map(|s| {
95 if s.distinct_count > 0 {
96 1.0 / s.distinct_count as f64
97 } else {
98 0.5
99 }
100 })
101 .unwrap_or(0.5)
102 }
103
104 pub fn estimate_range_selectivity(
106 &self,
107 property: &str,
108 lower: Option<&Value>,
109 upper: Option<&Value>,
110 ) -> f64 {
111 self.properties
112 .get(property)
113 .and_then(|s| s.histogram.as_ref())
114 .map(|h| h.estimate_range_selectivity(lower, upper, true, true))
115 .unwrap_or(0.33) }
117}
118
119#[derive(Debug, Clone)]
121pub struct LabelStatistics {
122 pub node_count: u64,
124 pub avg_out_degree: f64,
126 pub avg_in_degree: f64,
128 pub properties: HashMap<PropertyKey, ColumnStatistics>,
130}
131
132impl LabelStatistics {
133 pub fn new(node_count: u64) -> Self {
135 Self {
136 node_count,
137 avg_out_degree: 0.0,
138 avg_in_degree: 0.0,
139 properties: HashMap::new(),
140 }
141 }
142
143 pub fn with_degrees(mut self, out_degree: f64, in_degree: f64) -> Self {
145 self.avg_out_degree = out_degree;
146 self.avg_in_degree = in_degree;
147 self
148 }
149
150 pub fn with_property(mut self, property: &str, stats: ColumnStatistics) -> Self {
152 self.properties.insert(property.to_string(), stats);
153 self
154 }
155}
156
157pub type TableStatistics = LabelStatistics;
159
160#[derive(Debug, Clone)]
162pub struct EdgeTypeStatistics {
163 pub edge_count: u64,
165 pub avg_out_degree: f64,
167 pub avg_in_degree: f64,
169 pub properties: HashMap<PropertyKey, ColumnStatistics>,
171}
172
173impl EdgeTypeStatistics {
174 pub fn new(edge_count: u64, avg_out_degree: f64, avg_in_degree: f64) -> Self {
176 Self {
177 edge_count,
178 avg_out_degree,
179 avg_in_degree,
180 properties: HashMap::new(),
181 }
182 }
183
184 pub fn with_property(mut self, property: &str, stats: ColumnStatistics) -> Self {
186 self.properties.insert(property.to_string(), stats);
187 self
188 }
189}
190
191#[derive(Debug, Clone)]
193pub struct ColumnStatistics {
194 pub distinct_count: u64,
196 pub total_count: u64,
198 pub null_count: u64,
200 pub min_value: Option<Value>,
202 pub max_value: Option<Value>,
204 pub avg_value: Option<f64>,
206 pub histogram: Option<Histogram>,
208 pub most_common: Vec<(Value, f64)>,
210}
211
212impl ColumnStatistics {
213 pub fn new(distinct_count: u64, total_count: u64, null_count: u64) -> Self {
215 Self {
216 distinct_count,
217 total_count,
218 null_count,
219 min_value: None,
220 max_value: None,
221 avg_value: None,
222 histogram: None,
223 most_common: Vec::new(),
224 }
225 }
226
227 pub fn with_min_max(mut self, min: Value, max: Value) -> Self {
229 self.min_value = Some(min);
230 self.max_value = Some(max);
231 self
232 }
233
234 pub fn with_avg(mut self, avg: f64) -> Self {
236 self.avg_value = Some(avg);
237 self
238 }
239
240 pub fn with_histogram(mut self, histogram: Histogram) -> Self {
242 self.histogram = Some(histogram);
243 self
244 }
245
246 pub fn with_most_common(mut self, values: Vec<(Value, f64)>) -> Self {
248 self.most_common = values;
249 self
250 }
251
252 pub fn null_fraction(&self) -> f64 {
254 if self.total_count == 0 {
255 0.0
256 } else {
257 self.null_count as f64 / self.total_count as f64
258 }
259 }
260
261 pub fn estimate_equality_selectivity(&self, value: &Value) -> f64 {
263 for (mcv, freq) in &self.most_common {
265 if mcv == value {
266 return *freq;
267 }
268 }
269
270 if let Some(ref hist) = self.histogram {
272 return hist.estimate_equality_selectivity(value);
273 }
274
275 if self.distinct_count > 0 {
277 1.0 / self.distinct_count as f64
278 } else {
279 0.0
280 }
281 }
282
283 pub fn estimate_range_selectivity(&self, lower: Option<&Value>, upper: Option<&Value>) -> f64 {
285 if let Some(ref hist) = self.histogram {
286 return hist.estimate_range_selectivity(lower, upper, true, true);
287 }
288
289 match (&self.min_value, &self.max_value, lower, upper) {
291 (Some(min), Some(max), Some(l), Some(u)) => {
292 estimate_linear_range(min, max, l, u)
294 }
295 (Some(_), Some(_), Some(_), None) => 0.5, (Some(_), Some(_), None, Some(_)) => 0.5, _ => 0.33, }
299 }
300}
301
302fn estimate_linear_range(min: &Value, max: &Value, lower: &Value, upper: &Value) -> f64 {
304 match (min, max, lower, upper) {
305 (
306 Value::Int64(min_v),
307 Value::Int64(max_v),
308 Value::Int64(lower_v),
309 Value::Int64(upper_v),
310 ) => {
311 let total_range = (max_v - min_v) as f64;
312 if total_range <= 0.0 {
313 return 1.0;
314 }
315
316 let effective_lower = (*lower_v).max(*min_v);
317 let effective_upper = (*upper_v).min(*max_v);
318
319 if effective_upper < effective_lower {
320 return 0.0;
321 }
322
323 (effective_upper - effective_lower) as f64 / total_range
324 }
325 (
326 Value::Float64(min_v),
327 Value::Float64(max_v),
328 Value::Float64(lower_v),
329 Value::Float64(upper_v),
330 ) => {
331 let total_range = max_v - min_v;
332 if total_range <= 0.0 {
333 return 1.0;
334 }
335
336 let effective_lower = lower_v.max(*min_v);
337 let effective_upper = upper_v.min(*max_v);
338
339 if effective_upper < effective_lower {
340 return 0.0;
341 }
342
343 (effective_upper - effective_lower) / total_range
344 }
345 _ => 0.33,
346 }
347}
348
349#[allow(dead_code)]
354pub struct StatisticsCollector {
355 values: Vec<Value>,
357 distinct: std::collections::HashSet<String>,
359 min: Option<Value>,
361 max: Option<Value>,
363 sum: f64,
365 null_count: u64,
367 frequencies: HashMap<String, u64>,
369}
370
371#[allow(dead_code)]
372impl StatisticsCollector {
373 pub fn new() -> Self {
375 Self {
376 values: Vec::new(),
377 distinct: std::collections::HashSet::new(),
378 min: None,
379 max: None,
380 sum: 0.0,
381 null_count: 0,
382 frequencies: HashMap::new(),
383 }
384 }
385
386 pub fn add(&mut self, value: Value) {
388 if matches!(value, Value::Null) {
389 self.null_count += 1;
390 return;
391 }
392
393 let key = format!("{value:?}");
395 self.distinct.insert(key.clone());
396
397 *self.frequencies.entry(key).or_insert(0) += 1;
399
400 self.update_min_max(&value);
402
403 if let Some(v) = value_to_f64(&value) {
405 self.sum += v;
406 }
407
408 self.values.push(value);
409 }
410
411 fn update_min_max(&mut self, value: &Value) {
412 match &self.min {
414 None => self.min = Some(value.clone()),
415 Some(current) => {
416 if compare_values(value, current) == Some(std::cmp::Ordering::Less) {
417 self.min = Some(value.clone());
418 }
419 }
420 }
421
422 match &self.max {
424 None => self.max = Some(value.clone()),
425 Some(current) => {
426 if compare_values(value, current) == Some(std::cmp::Ordering::Greater) {
427 self.max = Some(value.clone());
428 }
429 }
430 }
431 }
432
433 pub fn build(mut self, num_histogram_buckets: usize, num_mcv: usize) -> ColumnStatistics {
435 let total_count = self.values.len() as u64 + self.null_count;
436 let distinct_count = self.distinct.len() as u64;
437
438 let avg = if !self.values.is_empty() {
439 Some(self.sum / self.values.len() as f64)
440 } else {
441 None
442 };
443
444 self.values
446 .sort_by(|a, b| compare_values(a, b).unwrap_or(std::cmp::Ordering::Equal));
447 let histogram = if self.values.len() >= num_histogram_buckets {
448 Some(Histogram::build(&self.values, num_histogram_buckets))
449 } else {
450 None
451 };
452
453 let total_non_null = self.values.len() as f64;
455 let mut freq_vec: Vec<_> = self.frequencies.into_iter().collect();
456 freq_vec.sort_by(|a, b| b.1.cmp(&a.1));
457
458 let most_common: Vec<(Value, f64)> = freq_vec
459 .into_iter()
460 .take(num_mcv)
461 .filter_map(|(key, count)| {
462 let freq = count as f64 / total_non_null;
464 if key.starts_with("Int64(") {
466 let num_str = key.trim_start_matches("Int64(").trim_end_matches(')');
467 num_str.parse::<i64>().ok().map(|n| (Value::Int64(n), freq))
468 } else if key.starts_with("String(") {
469 let s = key
470 .trim_start_matches("String(Arc(\"")
471 .trim_end_matches("\"))");
472 Some((Value::String(s.to_string().into()), freq))
473 } else {
474 None
475 }
476 })
477 .collect();
478
479 let mut stats = ColumnStatistics::new(distinct_count, total_count, self.null_count);
480
481 if let Some(min) = self.min {
482 if let Some(max) = self.max {
483 stats = stats.with_min_max(min, max);
484 }
485 }
486
487 if let Some(avg) = avg {
488 stats = stats.with_avg(avg);
489 }
490
491 if let Some(hist) = histogram {
492 stats = stats.with_histogram(hist);
493 }
494
495 if !most_common.is_empty() {
496 stats = stats.with_most_common(most_common);
497 }
498
499 stats
500 }
501}
502
503impl Default for StatisticsCollector {
504 fn default() -> Self {
505 Self::new()
506 }
507}
508
509#[allow(dead_code)]
511fn value_to_f64(value: &Value) -> Option<f64> {
512 match value {
513 Value::Int64(i) => Some(*i as f64),
514 Value::Float64(f) => Some(*f),
515 _ => None,
516 }
517}
518
519#[allow(dead_code)]
521fn compare_values(a: &Value, b: &Value) -> Option<std::cmp::Ordering> {
522 match (a, b) {
523 (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
524 (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
525 (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
526 (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
527 (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
528 (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
529 _ => None,
530 }
531}
532
533#[cfg(test)]
534mod tests {
535 use super::*;
536
537 #[test]
538 fn test_statistics_collector() {
539 let mut collector = StatisticsCollector::new();
540
541 for i in 0..100 {
542 collector.add(Value::Int64(i % 10)); }
544
545 let stats = collector.build(10, 5);
546
547 assert_eq!(stats.distinct_count, 10);
548 assert_eq!(stats.total_count, 100);
549 assert_eq!(stats.null_count, 0);
550 assert_eq!(stats.min_value, Some(Value::Int64(0)));
551 assert_eq!(stats.max_value, Some(Value::Int64(9)));
552 }
553
554 #[test]
555 fn test_statistics_with_nulls() {
556 let mut collector = StatisticsCollector::new();
557
558 collector.add(Value::Int64(1));
559 collector.add(Value::Null);
560 collector.add(Value::Int64(2));
561 collector.add(Value::Null);
562 collector.add(Value::Int64(3));
563
564 let stats = collector.build(5, 3);
565
566 assert_eq!(stats.total_count, 5);
567 assert_eq!(stats.null_count, 2);
568 assert_eq!(stats.distinct_count, 3);
569 assert!((stats.null_fraction() - 0.4).abs() < 0.01);
570 }
571
572 #[test]
573 fn test_label_statistics() {
574 let stats = LabelStatistics::new(1000)
575 .with_degrees(5.0, 3.0)
576 .with_property(
577 "age",
578 ColumnStatistics::new(50, 1000, 10)
579 .with_min_max(Value::Int64(0), Value::Int64(100)),
580 );
581
582 assert_eq!(stats.node_count, 1000);
583 assert_eq!(stats.avg_out_degree, 5.0);
584 assert!(stats.properties.contains_key("age"));
585 }
586
587 #[test]
588 fn test_database_statistics() {
589 let mut db_stats = Statistics::new();
590
591 db_stats.update_label(
592 "Person",
593 LabelStatistics::new(10000).with_degrees(10.0, 10.0),
594 );
595
596 db_stats.update_edge_type("KNOWS", EdgeTypeStatistics::new(50000, 5.0, 5.0));
597
598 assert_eq!(db_stats.estimate_label_cardinality("Person"), 10000.0);
599 assert_eq!(db_stats.estimate_label_cardinality("Unknown"), 1000.0); assert_eq!(db_stats.estimate_avg_degree("KNOWS", true), 5.0);
602 }
603}