1use super::histogram::Histogram;
7use grafeo_common::types::Value;
8use std::collections::HashMap;
9
10pub type PropertyKey = String;
12
13#[derive(Debug, Clone, Default)]
15pub struct Statistics {
16 pub labels: HashMap<String, LabelStatistics>,
18 pub edge_types: HashMap<String, EdgeTypeStatistics>,
20 pub properties: HashMap<PropertyKey, ColumnStatistics>,
22 pub total_nodes: u64,
24 pub total_edges: u64,
26}
27
28impl Statistics {
29 pub fn new() -> Self {
31 Self::default()
32 }
33
34 pub fn update_label(&mut self, label: &str, stats: LabelStatistics) {
36 self.labels.insert(label.to_string(), stats);
37 }
38
39 pub fn update_edge_type(&mut self, edge_type: &str, stats: EdgeTypeStatistics) {
41 self.edge_types.insert(edge_type.to_string(), stats);
42 }
43
44 pub fn update_property(&mut self, property: &str, stats: ColumnStatistics) {
46 self.properties.insert(property.to_string(), stats);
47 }
48
49 pub fn get_label(&self, label: &str) -> Option<&LabelStatistics> {
51 self.labels.get(label)
52 }
53
54 pub fn get_edge_type(&self, edge_type: &str) -> Option<&EdgeTypeStatistics> {
56 self.edge_types.get(edge_type)
57 }
58
59 pub fn get_property(&self, property: &str) -> Option<&ColumnStatistics> {
61 self.properties.get(property)
62 }
63
64 pub fn estimate_label_cardinality(&self, label: &str) -> f64 {
66 self.labels
67 .get(label)
68 .map(|s| s.node_count as f64)
69 .unwrap_or(1000.0) }
71
72 pub fn estimate_avg_degree(&self, edge_type: &str, outgoing: bool) -> f64 {
74 self.edge_types
75 .get(edge_type)
76 .map(|s| {
77 if outgoing {
78 s.avg_out_degree
79 } else {
80 s.avg_in_degree
81 }
82 })
83 .unwrap_or(10.0) }
85
86 pub fn estimate_equality_selectivity(&self, property: &str, _value: &Value) -> f64 {
88 self.properties
89 .get(property)
90 .map(|s| {
91 if s.distinct_count > 0 {
92 1.0 / s.distinct_count as f64
93 } else {
94 0.5
95 }
96 })
97 .unwrap_or(0.5)
98 }
99
100 pub fn estimate_range_selectivity(
102 &self,
103 property: &str,
104 lower: Option<&Value>,
105 upper: Option<&Value>,
106 ) -> f64 {
107 self.properties
108 .get(property)
109 .and_then(|s| s.histogram.as_ref())
110 .map(|h| h.estimate_range_selectivity(lower, upper, true, true))
111 .unwrap_or(0.33) }
113}
114
115#[derive(Debug, Clone)]
117pub struct LabelStatistics {
118 pub node_count: u64,
120 pub avg_out_degree: f64,
122 pub avg_in_degree: f64,
124 pub properties: HashMap<PropertyKey, ColumnStatistics>,
126}
127
128impl LabelStatistics {
129 pub fn new(node_count: u64) -> Self {
131 Self {
132 node_count,
133 avg_out_degree: 0.0,
134 avg_in_degree: 0.0,
135 properties: HashMap::new(),
136 }
137 }
138
139 pub fn with_degrees(mut self, out_degree: f64, in_degree: f64) -> Self {
141 self.avg_out_degree = out_degree;
142 self.avg_in_degree = in_degree;
143 self
144 }
145
146 pub fn with_property(mut self, property: &str, stats: ColumnStatistics) -> Self {
148 self.properties.insert(property.to_string(), stats);
149 self
150 }
151}
152
153pub type TableStatistics = LabelStatistics;
155
156#[derive(Debug, Clone)]
158pub struct EdgeTypeStatistics {
159 pub edge_count: u64,
161 pub avg_out_degree: f64,
163 pub avg_in_degree: f64,
165 pub properties: HashMap<PropertyKey, ColumnStatistics>,
167}
168
169impl EdgeTypeStatistics {
170 pub fn new(edge_count: u64, avg_out_degree: f64, avg_in_degree: f64) -> Self {
172 Self {
173 edge_count,
174 avg_out_degree,
175 avg_in_degree,
176 properties: HashMap::new(),
177 }
178 }
179
180 pub fn with_property(mut self, property: &str, stats: ColumnStatistics) -> Self {
182 self.properties.insert(property.to_string(), stats);
183 self
184 }
185}
186
187#[derive(Debug, Clone)]
189pub struct ColumnStatistics {
190 pub distinct_count: u64,
192 pub total_count: u64,
194 pub null_count: u64,
196 pub min_value: Option<Value>,
198 pub max_value: Option<Value>,
200 pub avg_value: Option<f64>,
202 pub histogram: Option<Histogram>,
204 pub most_common: Vec<(Value, f64)>,
206}
207
208impl ColumnStatistics {
209 pub fn new(distinct_count: u64, total_count: u64, null_count: u64) -> Self {
211 Self {
212 distinct_count,
213 total_count,
214 null_count,
215 min_value: None,
216 max_value: None,
217 avg_value: None,
218 histogram: None,
219 most_common: Vec::new(),
220 }
221 }
222
223 pub fn with_min_max(mut self, min: Value, max: Value) -> Self {
225 self.min_value = Some(min);
226 self.max_value = Some(max);
227 self
228 }
229
230 pub fn with_avg(mut self, avg: f64) -> Self {
232 self.avg_value = Some(avg);
233 self
234 }
235
236 pub fn with_histogram(mut self, histogram: Histogram) -> Self {
238 self.histogram = Some(histogram);
239 self
240 }
241
242 pub fn with_most_common(mut self, values: Vec<(Value, f64)>) -> Self {
244 self.most_common = values;
245 self
246 }
247
248 pub fn null_fraction(&self) -> f64 {
250 if self.total_count == 0 {
251 0.0
252 } else {
253 self.null_count as f64 / self.total_count as f64
254 }
255 }
256
257 pub fn estimate_equality_selectivity(&self, value: &Value) -> f64 {
259 for (mcv, freq) in &self.most_common {
261 if mcv == value {
262 return *freq;
263 }
264 }
265
266 if let Some(ref hist) = self.histogram {
268 return hist.estimate_equality_selectivity(value);
269 }
270
271 if self.distinct_count > 0 {
273 1.0 / self.distinct_count as f64
274 } else {
275 0.0
276 }
277 }
278
279 pub fn estimate_range_selectivity(&self, lower: Option<&Value>, upper: Option<&Value>) -> f64 {
281 if let Some(ref hist) = self.histogram {
282 return hist.estimate_range_selectivity(lower, upper, true, true);
283 }
284
285 match (&self.min_value, &self.max_value, lower, upper) {
287 (Some(min), Some(max), Some(l), Some(u)) => {
288 estimate_linear_range(min, max, l, u)
290 }
291 (Some(_), Some(_), Some(_), None) => 0.5, (Some(_), Some(_), None, Some(_)) => 0.5, _ => 0.33, }
295 }
296}
297
298fn estimate_linear_range(min: &Value, max: &Value, lower: &Value, upper: &Value) -> f64 {
300 match (min, max, lower, upper) {
301 (
302 Value::Int64(min_v),
303 Value::Int64(max_v),
304 Value::Int64(lower_v),
305 Value::Int64(upper_v),
306 ) => {
307 let total_range = (max_v - min_v) as f64;
308 if total_range <= 0.0 {
309 return 1.0;
310 }
311
312 let effective_lower = (*lower_v).max(*min_v);
313 let effective_upper = (*upper_v).min(*max_v);
314
315 if effective_upper < effective_lower {
316 return 0.0;
317 }
318
319 (effective_upper - effective_lower) as f64 / total_range
320 }
321 (
322 Value::Float64(min_v),
323 Value::Float64(max_v),
324 Value::Float64(lower_v),
325 Value::Float64(upper_v),
326 ) => {
327 let total_range = max_v - min_v;
328 if total_range <= 0.0 {
329 return 1.0;
330 }
331
332 let effective_lower = lower_v.max(*min_v);
333 let effective_upper = upper_v.min(*max_v);
334
335 if effective_upper < effective_lower {
336 return 0.0;
337 }
338
339 (effective_upper - effective_lower) / total_range
340 }
341 _ => 0.33,
342 }
343}
344
345#[allow(dead_code)]
347pub struct StatisticsCollector {
348 values: Vec<Value>,
350 distinct: std::collections::HashSet<String>,
352 min: Option<Value>,
354 max: Option<Value>,
356 sum: f64,
358 null_count: u64,
360 frequencies: HashMap<String, u64>,
362}
363
364#[allow(dead_code)]
365impl StatisticsCollector {
366 pub fn new() -> Self {
368 Self {
369 values: Vec::new(),
370 distinct: std::collections::HashSet::new(),
371 min: None,
372 max: None,
373 sum: 0.0,
374 null_count: 0,
375 frequencies: HashMap::new(),
376 }
377 }
378
379 pub fn add(&mut self, value: Value) {
381 if matches!(value, Value::Null) {
382 self.null_count += 1;
383 return;
384 }
385
386 let key = format!("{value:?}");
388 self.distinct.insert(key.clone());
389
390 *self.frequencies.entry(key).or_insert(0) += 1;
392
393 self.update_min_max(&value);
395
396 if let Some(v) = value_to_f64(&value) {
398 self.sum += v;
399 }
400
401 self.values.push(value);
402 }
403
404 fn update_min_max(&mut self, value: &Value) {
405 match &self.min {
407 None => self.min = Some(value.clone()),
408 Some(current) => {
409 if compare_values(value, current) == Some(std::cmp::Ordering::Less) {
410 self.min = Some(value.clone());
411 }
412 }
413 }
414
415 match &self.max {
417 None => self.max = Some(value.clone()),
418 Some(current) => {
419 if compare_values(value, current) == Some(std::cmp::Ordering::Greater) {
420 self.max = Some(value.clone());
421 }
422 }
423 }
424 }
425
426 pub fn build(mut self, num_histogram_buckets: usize, num_mcv: usize) -> ColumnStatistics {
428 let total_count = self.values.len() as u64 + self.null_count;
429 let distinct_count = self.distinct.len() as u64;
430
431 let avg = if !self.values.is_empty() {
432 Some(self.sum / self.values.len() as f64)
433 } else {
434 None
435 };
436
437 self.values
439 .sort_by(|a, b| compare_values(a, b).unwrap_or(std::cmp::Ordering::Equal));
440 let histogram = if self.values.len() >= num_histogram_buckets {
441 Some(Histogram::build(&self.values, num_histogram_buckets))
442 } else {
443 None
444 };
445
446 let total_non_null = self.values.len() as f64;
448 let mut freq_vec: Vec<_> = self.frequencies.into_iter().collect();
449 freq_vec.sort_by(|a, b| b.1.cmp(&a.1));
450
451 let most_common: Vec<(Value, f64)> = freq_vec
452 .into_iter()
453 .take(num_mcv)
454 .filter_map(|(key, count)| {
455 let freq = count as f64 / total_non_null;
457 if key.starts_with("Int64(") {
459 let num_str = key.trim_start_matches("Int64(").trim_end_matches(')');
460 num_str.parse::<i64>().ok().map(|n| (Value::Int64(n), freq))
461 } else if key.starts_with("String(") {
462 let s = key
463 .trim_start_matches("String(Arc(\"")
464 .trim_end_matches("\"))");
465 Some((Value::String(s.to_string().into()), freq))
466 } else {
467 None
468 }
469 })
470 .collect();
471
472 let mut stats = ColumnStatistics::new(distinct_count, total_count, self.null_count);
473
474 if let Some(min) = self.min {
475 if let Some(max) = self.max {
476 stats = stats.with_min_max(min, max);
477 }
478 }
479
480 if let Some(avg) = avg {
481 stats = stats.with_avg(avg);
482 }
483
484 if let Some(hist) = histogram {
485 stats = stats.with_histogram(hist);
486 }
487
488 if !most_common.is_empty() {
489 stats = stats.with_most_common(most_common);
490 }
491
492 stats
493 }
494}
495
496impl Default for StatisticsCollector {
497 fn default() -> Self {
498 Self::new()
499 }
500}
501
502#[allow(dead_code)]
504fn value_to_f64(value: &Value) -> Option<f64> {
505 match value {
506 Value::Int64(i) => Some(*i as f64),
507 Value::Float64(f) => Some(*f),
508 _ => None,
509 }
510}
511
512#[allow(dead_code)]
514fn compare_values(a: &Value, b: &Value) -> Option<std::cmp::Ordering> {
515 match (a, b) {
516 (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
517 (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
518 (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
519 (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
520 (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
521 (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
522 _ => None,
523 }
524}
525
526#[cfg(test)]
527mod tests {
528 use super::*;
529
530 #[test]
531 fn test_statistics_collector() {
532 let mut collector = StatisticsCollector::new();
533
534 for i in 0..100 {
535 collector.add(Value::Int64(i % 10)); }
537
538 let stats = collector.build(10, 5);
539
540 assert_eq!(stats.distinct_count, 10);
541 assert_eq!(stats.total_count, 100);
542 assert_eq!(stats.null_count, 0);
543 assert_eq!(stats.min_value, Some(Value::Int64(0)));
544 assert_eq!(stats.max_value, Some(Value::Int64(9)));
545 }
546
547 #[test]
548 fn test_statistics_with_nulls() {
549 let mut collector = StatisticsCollector::new();
550
551 collector.add(Value::Int64(1));
552 collector.add(Value::Null);
553 collector.add(Value::Int64(2));
554 collector.add(Value::Null);
555 collector.add(Value::Int64(3));
556
557 let stats = collector.build(5, 3);
558
559 assert_eq!(stats.total_count, 5);
560 assert_eq!(stats.null_count, 2);
561 assert_eq!(stats.distinct_count, 3);
562 assert!((stats.null_fraction() - 0.4).abs() < 0.01);
563 }
564
565 #[test]
566 fn test_label_statistics() {
567 let stats = LabelStatistics::new(1000)
568 .with_degrees(5.0, 3.0)
569 .with_property(
570 "age",
571 ColumnStatistics::new(50, 1000, 10)
572 .with_min_max(Value::Int64(0), Value::Int64(100)),
573 );
574
575 assert_eq!(stats.node_count, 1000);
576 assert_eq!(stats.avg_out_degree, 5.0);
577 assert!(stats.properties.contains_key("age"));
578 }
579
580 #[test]
581 fn test_database_statistics() {
582 let mut db_stats = Statistics::new();
583
584 db_stats.update_label(
585 "Person",
586 LabelStatistics::new(10000).with_degrees(10.0, 10.0),
587 );
588
589 db_stats.update_edge_type("KNOWS", EdgeTypeStatistics::new(50000, 5.0, 5.0));
590
591 assert_eq!(db_stats.estimate_label_cardinality("Person"), 10000.0);
592 assert_eq!(db_stats.estimate_label_cardinality("Unknown"), 1000.0); assert_eq!(db_stats.estimate_avg_degree("KNOWS", true), 5.0);
595 }
596}