1use super::histogram::Histogram;
8use grafeo_common::types::Value;
9use std::collections::HashMap;
10
11pub type PropertyKey = String;
13
14#[derive(Debug, Clone, Default)]
19pub struct Statistics {
20 pub labels: HashMap<String, LabelStatistics>,
22 pub edge_types: HashMap<String, EdgeTypeStatistics>,
24 pub properties: HashMap<PropertyKey, ColumnStatistics>,
26 pub total_nodes: u64,
28 pub total_edges: u64,
30}
31
32impl Statistics {
33 pub fn new() -> Self {
35 Self::default()
36 }
37
38 pub fn update_label(&mut self, label: &str, stats: LabelStatistics) {
40 self.labels.insert(label.to_string(), stats);
41 }
42
43 pub fn update_edge_type(&mut self, edge_type: &str, stats: EdgeTypeStatistics) {
45 self.edge_types.insert(edge_type.to_string(), stats);
46 }
47
48 pub fn update_property(&mut self, property: &str, stats: ColumnStatistics) {
50 self.properties.insert(property.to_string(), stats);
51 }
52
53 pub fn get_label(&self, label: &str) -> Option<&LabelStatistics> {
55 self.labels.get(label)
56 }
57
58 pub fn get_edge_type(&self, edge_type: &str) -> Option<&EdgeTypeStatistics> {
60 self.edge_types.get(edge_type)
61 }
62
63 pub fn get_property(&self, property: &str) -> Option<&ColumnStatistics> {
65 self.properties.get(property)
66 }
67
68 pub fn estimate_label_cardinality(&self, label: &str) -> f64 {
70 self.labels
71 .get(label)
72 .map_or(1000.0, |s| s.node_count as f64) }
74
75 pub fn estimate_avg_degree(&self, edge_type: &str, outgoing: bool) -> f64 {
77 self.edge_types.get(edge_type).map_or(10.0, |s| {
78 if outgoing {
79 s.avg_out_degree
80 } else {
81 s.avg_in_degree
82 }
83 }) }
85
86 pub fn estimate_equality_selectivity(&self, property: &str, _value: &Value) -> f64 {
88 self.properties.get(property).map_or(0.5, |s| {
89 if s.distinct_count > 0 {
90 1.0 / s.distinct_count as f64
91 } else {
92 0.5
93 }
94 })
95 }
96
97 pub fn estimate_range_selectivity(
99 &self,
100 property: &str,
101 lower: Option<&Value>,
102 upper: Option<&Value>,
103 ) -> f64 {
104 self.properties
105 .get(property)
106 .and_then(|s| s.histogram.as_ref())
107 .map_or(0.33, |h| {
108 h.estimate_range_selectivity(lower, upper, true, true)
109 }) }
111}
112
113#[derive(Debug, Clone)]
115pub struct LabelStatistics {
116 pub node_count: u64,
118 pub avg_out_degree: f64,
120 pub avg_in_degree: f64,
122 pub properties: HashMap<PropertyKey, ColumnStatistics>,
124}
125
126impl LabelStatistics {
127 pub fn new(node_count: u64) -> Self {
129 Self {
130 node_count,
131 avg_out_degree: 0.0,
132 avg_in_degree: 0.0,
133 properties: HashMap::new(),
134 }
135 }
136
137 pub fn with_degrees(mut self, out_degree: f64, in_degree: f64) -> Self {
139 self.avg_out_degree = out_degree;
140 self.avg_in_degree = in_degree;
141 self
142 }
143
144 pub fn with_property(mut self, property: &str, stats: ColumnStatistics) -> Self {
146 self.properties.insert(property.to_string(), stats);
147 self
148 }
149}
150
151pub type TableStatistics = LabelStatistics;
153
154#[derive(Debug, Clone)]
156pub struct EdgeTypeStatistics {
157 pub edge_count: u64,
159 pub avg_out_degree: f64,
161 pub avg_in_degree: f64,
163 pub properties: HashMap<PropertyKey, ColumnStatistics>,
165}
166
167impl EdgeTypeStatistics {
168 pub fn new(edge_count: u64, avg_out_degree: f64, avg_in_degree: f64) -> Self {
170 Self {
171 edge_count,
172 avg_out_degree,
173 avg_in_degree,
174 properties: HashMap::new(),
175 }
176 }
177
178 pub fn with_property(mut self, property: &str, stats: ColumnStatistics) -> Self {
180 self.properties.insert(property.to_string(), stats);
181 self
182 }
183}
184
185#[derive(Debug, Clone)]
187pub struct ColumnStatistics {
188 pub distinct_count: u64,
190 pub total_count: u64,
192 pub null_count: u64,
194 pub min_value: Option<Value>,
196 pub max_value: Option<Value>,
198 pub avg_value: Option<f64>,
200 pub histogram: Option<Histogram>,
202 pub most_common: Vec<(Value, f64)>,
204}
205
206impl ColumnStatistics {
207 pub fn new(distinct_count: u64, total_count: u64, null_count: u64) -> Self {
209 Self {
210 distinct_count,
211 total_count,
212 null_count,
213 min_value: None,
214 max_value: None,
215 avg_value: None,
216 histogram: None,
217 most_common: Vec::new(),
218 }
219 }
220
221 pub fn with_min_max(mut self, min: Value, max: Value) -> Self {
223 self.min_value = Some(min);
224 self.max_value = Some(max);
225 self
226 }
227
228 pub fn with_avg(mut self, avg: f64) -> Self {
230 self.avg_value = Some(avg);
231 self
232 }
233
234 pub fn with_histogram(mut self, histogram: Histogram) -> Self {
236 self.histogram = Some(histogram);
237 self
238 }
239
240 pub fn with_most_common(mut self, values: Vec<(Value, f64)>) -> Self {
242 self.most_common = values;
243 self
244 }
245
246 pub fn null_fraction(&self) -> f64 {
248 if self.total_count == 0 {
249 0.0
250 } else {
251 self.null_count as f64 / self.total_count as f64
252 }
253 }
254
255 pub fn estimate_equality_selectivity(&self, value: &Value) -> f64 {
257 for (mcv, freq) in &self.most_common {
259 if mcv == value {
260 return *freq;
261 }
262 }
263
264 if let Some(ref hist) = self.histogram {
266 return hist.estimate_equality_selectivity(value);
267 }
268
269 if self.distinct_count > 0 {
271 1.0 / self.distinct_count as f64
272 } else {
273 0.0
274 }
275 }
276
277 pub fn estimate_range_selectivity(&self, lower: Option<&Value>, upper: Option<&Value>) -> f64 {
279 if let Some(ref hist) = self.histogram {
280 return hist.estimate_range_selectivity(lower, upper, true, true);
281 }
282
283 match (&self.min_value, &self.max_value, lower, upper) {
285 (Some(min), Some(max), Some(l), Some(u)) => {
286 estimate_linear_range(min, max, l, u)
288 }
289 (Some(_), Some(_), Some(_), None) => 0.5, (Some(_), Some(_), None, Some(_)) => 0.5, _ => 0.33, }
293 }
294}
295
296fn estimate_linear_range(min: &Value, max: &Value, lower: &Value, upper: &Value) -> f64 {
298 match (min, max, lower, upper) {
299 (
300 Value::Int64(min_v),
301 Value::Int64(max_v),
302 Value::Int64(lower_v),
303 Value::Int64(upper_v),
304 ) => {
305 let total_range = (max_v - min_v) as f64;
306 if total_range <= 0.0 {
307 return 1.0;
308 }
309
310 let effective_lower = (*lower_v).max(*min_v);
311 let effective_upper = (*upper_v).min(*max_v);
312
313 if effective_upper < effective_lower {
314 return 0.0;
315 }
316
317 (effective_upper - effective_lower) as f64 / total_range
318 }
319 (
320 Value::Float64(min_v),
321 Value::Float64(max_v),
322 Value::Float64(lower_v),
323 Value::Float64(upper_v),
324 ) => {
325 let total_range = max_v - min_v;
326 if total_range <= 0.0 {
327 return 1.0;
328 }
329
330 let effective_lower = lower_v.max(*min_v);
331 let effective_upper = upper_v.min(*max_v);
332
333 if effective_upper < effective_lower {
334 return 0.0;
335 }
336
337 (effective_upper - effective_lower) / total_range
338 }
339 _ => 0.33,
340 }
341}
342
343#[allow(dead_code)] pub struct StatisticsCollector {
349 values: Vec<Value>,
351 distinct: std::collections::HashSet<String>,
353 min: Option<Value>,
355 max: Option<Value>,
357 sum: f64,
359 null_count: u64,
361 frequencies: HashMap<String, u64>,
363}
364
365#[allow(dead_code)] impl StatisticsCollector {
367 pub fn new() -> Self {
369 Self {
370 values: Vec::new(),
371 distinct: std::collections::HashSet::new(),
372 min: None,
373 max: None,
374 sum: 0.0,
375 null_count: 0,
376 frequencies: HashMap::new(),
377 }
378 }
379
380 pub fn add(&mut self, value: Value) {
382 if matches!(value, Value::Null) {
383 self.null_count += 1;
384 return;
385 }
386
387 let key = format!("{value:?}");
389 self.distinct.insert(key.clone());
390
391 *self.frequencies.entry(key).or_insert(0) += 1;
393
394 self.update_min_max(&value);
396
397 if let Some(v) = value_to_f64(&value) {
399 self.sum += v;
400 }
401
402 self.values.push(value);
403 }
404
405 fn update_min_max(&mut self, value: &Value) {
406 match &self.min {
408 None => self.min = Some(value.clone()),
409 Some(current) => {
410 if compare_values(value, current) == Some(std::cmp::Ordering::Less) {
411 self.min = Some(value.clone());
412 }
413 }
414 }
415
416 match &self.max {
418 None => self.max = Some(value.clone()),
419 Some(current) => {
420 if compare_values(value, current) == Some(std::cmp::Ordering::Greater) {
421 self.max = Some(value.clone());
422 }
423 }
424 }
425 }
426
427 pub fn build(mut self, num_histogram_buckets: usize, num_mcv: usize) -> ColumnStatistics {
429 let total_count = self.values.len() as u64 + self.null_count;
430 let distinct_count = self.distinct.len() as u64;
431
432 let avg = if !self.values.is_empty() {
433 Some(self.sum / self.values.len() as f64)
434 } else {
435 None
436 };
437
438 self.values
440 .sort_by(|a, b| compare_values(a, b).unwrap_or(std::cmp::Ordering::Equal));
441 let histogram = if self.values.len() >= num_histogram_buckets {
442 Some(Histogram::build(&self.values, num_histogram_buckets))
443 } else {
444 None
445 };
446
447 let total_non_null = self.values.len() as f64;
449 let mut freq_vec: Vec<_> = self.frequencies.into_iter().collect();
450 freq_vec.sort_by(|a, b| b.1.cmp(&a.1));
451
452 let most_common: Vec<(Value, f64)> = freq_vec
453 .into_iter()
454 .take(num_mcv)
455 .filter_map(|(key, count)| {
456 let freq = count as f64 / total_non_null;
458 if key.starts_with("Int64(") {
460 let num_str = key.trim_start_matches("Int64(").trim_end_matches(')');
461 num_str.parse::<i64>().ok().map(|n| (Value::Int64(n), freq))
462 } else if key.starts_with("String(") {
463 let s = key
464 .trim_start_matches("String(Arc(\"")
465 .trim_end_matches("\"))");
466 Some((Value::String(s.to_string().into()), freq))
467 } else {
468 None
469 }
470 })
471 .collect();
472
473 let mut stats = ColumnStatistics::new(distinct_count, total_count, self.null_count);
474
475 if let Some(min) = self.min
476 && let Some(max) = self.max
477 {
478 stats = stats.with_min_max(min, max);
479 }
480
481 if let Some(avg) = avg {
482 stats = stats.with_avg(avg);
483 }
484
485 if let Some(hist) = histogram {
486 stats = stats.with_histogram(hist);
487 }
488
489 if !most_common.is_empty() {
490 stats = stats.with_most_common(most_common);
491 }
492
493 stats
494 }
495}
496
497impl Default for StatisticsCollector {
498 fn default() -> Self {
499 Self::new()
500 }
501}
502
503#[allow(dead_code)] fn value_to_f64(value: &Value) -> Option<f64> {
506 match value {
507 Value::Int64(i) => Some(*i as f64),
508 Value::Float64(f) => Some(*f),
509 _ => None,
510 }
511}
512
513#[allow(dead_code)] fn compare_values(a: &Value, b: &Value) -> Option<std::cmp::Ordering> {
516 match (a, b) {
517 (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
518 (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
519 (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
520 (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
521 (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
522 (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
523 _ => None,
524 }
525}
526
527#[cfg(test)]
528mod tests {
529 use super::*;
530
531 #[test]
532 fn test_statistics_collector() {
533 let mut collector = StatisticsCollector::new();
534
535 for i in 0..100 {
536 collector.add(Value::Int64(i % 10)); }
538
539 let stats = collector.build(10, 5);
540
541 assert_eq!(stats.distinct_count, 10);
542 assert_eq!(stats.total_count, 100);
543 assert_eq!(stats.null_count, 0);
544 assert_eq!(stats.min_value, Some(Value::Int64(0)));
545 assert_eq!(stats.max_value, Some(Value::Int64(9)));
546 }
547
548 #[test]
549 fn test_statistics_with_nulls() {
550 let mut collector = StatisticsCollector::new();
551
552 collector.add(Value::Int64(1));
553 collector.add(Value::Null);
554 collector.add(Value::Int64(2));
555 collector.add(Value::Null);
556 collector.add(Value::Int64(3));
557
558 let stats = collector.build(5, 3);
559
560 assert_eq!(stats.total_count, 5);
561 assert_eq!(stats.null_count, 2);
562 assert_eq!(stats.distinct_count, 3);
563 assert!((stats.null_fraction() - 0.4).abs() < 0.01);
564 }
565
566 #[test]
567 fn test_label_statistics() {
568 let stats = LabelStatistics::new(1000)
569 .with_degrees(5.0, 3.0)
570 .with_property(
571 "age",
572 ColumnStatistics::new(50, 1000, 10)
573 .with_min_max(Value::Int64(0), Value::Int64(100)),
574 );
575
576 assert_eq!(stats.node_count, 1000);
577 assert_eq!(stats.avg_out_degree, 5.0);
578 assert!(stats.properties.contains_key("age"));
579 }
580
581 #[test]
582 fn test_statistics_min_max_updates() {
583 let mut collector = StatisticsCollector::new();
585
586 collector.add(Value::Int64(50));
587 collector.add(Value::Int64(10)); collector.add(Value::Int64(90)); collector.add(Value::Int64(5)); collector.add(Value::Int64(95)); let stats = collector.build(2, 3);
593
594 assert_eq!(stats.min_value, Some(Value::Int64(5)));
595 assert_eq!(stats.max_value, Some(Value::Int64(95)));
596 }
597
598 #[test]
599 fn test_statistics_most_common_values() {
600 let mut collector = StatisticsCollector::new();
601
602 for _ in 0..50 {
604 collector.add(Value::Int64(42));
605 }
606 for _ in 0..30 {
607 collector.add(Value::Int64(7));
608 }
609 for _ in 0..20 {
610 collector.add(Value::String("hello".into()));
611 }
612
613 let stats = collector.build(5, 3);
614
615 assert!(
617 !stats.most_common.is_empty(),
618 "MCV list should be populated"
619 );
620
621 let (top_val, top_freq) = &stats.most_common[0];
623 assert_eq!(*top_val, Value::Int64(42));
624 assert!((top_freq - 0.5).abs() < 0.01, "42 appears 50/100 = 0.5");
625
626 let has_string = stats
628 .most_common
629 .iter()
630 .any(|(v, _)| matches!(v, Value::String(_)));
631 assert!(has_string, "String MCVs should be parsed back");
632 }
633
634 #[test]
635 fn test_database_statistics() {
636 let mut db_stats = Statistics::new();
637
638 db_stats.update_label(
639 "Person",
640 LabelStatistics::new(10000).with_degrees(10.0, 10.0),
641 );
642
643 db_stats.update_edge_type("KNOWS", EdgeTypeStatistics::new(50000, 5.0, 5.0));
644
645 assert_eq!(db_stats.estimate_label_cardinality("Person"), 10000.0);
646 assert_eq!(db_stats.estimate_label_cardinality("Unknown"), 1000.0); assert_eq!(db_stats.estimate_avg_degree("KNOWS", true), 5.0);
649 }
650}