use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use super::CsrIndex;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LabelStats {
pub edge_count: usize,
pub distinct_sources: usize,
pub distinct_targets: usize,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DegreeHistogram {
pub min: usize,
pub max: usize,
pub avg: f64,
pub p50: usize,
pub p95: usize,
pub p99: usize,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GraphStatistics {
pub node_count: usize,
pub edge_count: usize,
pub label_count: usize,
pub label_stats: HashMap<String, LabelStats>,
pub out_degree_histogram: DegreeHistogram,
pub in_degree_histogram: DegreeHistogram,
}
impl CsrIndex {
pub fn compute_statistics(&self) -> GraphStatistics {
let n = self.node_count();
if n == 0 {
return GraphStatistics {
node_count: 0,
edge_count: 0,
label_count: 0,
label_stats: HashMap::new(),
out_degree_histogram: DegreeHistogram {
min: 0,
max: 0,
avg: 0.0,
p50: 0,
p95: 0,
p99: 0,
},
in_degree_histogram: DegreeHistogram {
min: 0,
max: 0,
avg: 0.0,
p50: 0,
p95: 0,
p99: 0,
},
};
}
let mut label_edge_count: HashMap<u32, usize> = HashMap::new();
let mut label_sources: HashMap<u32, std::collections::HashSet<u32>> = HashMap::new();
let mut label_targets: HashMap<u32, std::collections::HashSet<u32>> = HashMap::new();
let mut out_degrees: Vec<usize> = Vec::with_capacity(n);
let mut in_degrees: Vec<usize> = Vec::with_capacity(n);
let mut total_edges = 0usize;
for node in 0..n {
let node_id = node as u32;
let mut out_deg = 0usize;
let mut in_deg = 0usize;
for (lid, dst) in self.iter_out_edges(node_id) {
out_deg += 1;
total_edges += 1;
*label_edge_count.entry(lid).or_insert(0) += 1;
label_sources.entry(lid).or_default().insert(node_id);
label_targets.entry(lid).or_default().insert(dst);
}
for (_lid, _src) in self.iter_in_edges(node_id) {
in_deg += 1;
}
out_degrees.push(out_deg);
in_degrees.push(in_deg);
}
let mut label_stats = HashMap::new();
for (&lid, &count) in &label_edge_count {
let label_name = self.label_name(lid).to_string();
label_stats.insert(
label_name,
LabelStats {
edge_count: count,
distinct_sources: label_sources.get(&lid).map_or(0, |s| s.len()),
distinct_targets: label_targets.get(&lid).map_or(0, |s| s.len()),
},
);
}
GraphStatistics {
node_count: n,
edge_count: total_edges,
label_count: label_edge_count.len(),
label_stats,
out_degree_histogram: compute_histogram(&out_degrees),
in_degree_histogram: compute_histogram(&in_degrees),
}
}
pub fn label_edge_count(&self, label: &str) -> usize {
let Some(lid) = self.label_id(label) else {
return 0;
};
let n = self.node_count();
let mut count = 0usize;
for node in 0..n {
for (l, _dst) in self.iter_out_edges(node as u32) {
if l == lid {
count += 1;
}
}
}
count
}
pub fn label_selectivity(&self, label: &str) -> f64 {
let total = self.edge_count();
if total == 0 {
return 0.0;
}
let count = self.label_edge_count(label);
if count == 0 {
return 1.0; }
count as f64 / total as f64
}
}
fn compute_histogram(degrees: &[usize]) -> DegreeHistogram {
if degrees.is_empty() {
return DegreeHistogram {
min: 0,
max: 0,
avg: 0.0,
p50: 0,
p95: 0,
p99: 0,
};
}
let mut sorted = degrees.to_vec();
sorted.sort_unstable();
let n = sorted.len();
let sum: usize = sorted.iter().sum();
DegreeHistogram {
min: sorted[0],
max: sorted[n - 1],
avg: sum as f64 / n as f64,
p50: sorted[n / 2],
p95: sorted[(n as f64 * 0.95) as usize],
p99: sorted[((n as f64 * 0.99) as usize).min(n - 1)],
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn statistics_empty_graph() {
let csr = CsrIndex::new();
let stats = csr.compute_statistics();
assert_eq!(stats.node_count, 0);
assert_eq!(stats.edge_count, 0);
assert_eq!(stats.label_count, 0);
}
#[test]
fn statistics_basic() {
let mut csr = CsrIndex::new();
csr.add_edge("a", "KNOWS", "b").unwrap();
csr.add_edge("b", "KNOWS", "c").unwrap();
csr.add_edge("a", "LIKES", "c").unwrap();
csr.compact();
let stats = csr.compute_statistics();
assert_eq!(stats.node_count, 3);
assert_eq!(stats.edge_count, 3);
assert_eq!(stats.label_count, 2);
let knows = &stats.label_stats["KNOWS"];
assert_eq!(knows.edge_count, 2);
assert_eq!(knows.distinct_sources, 2);
assert_eq!(knows.distinct_targets, 2);
let likes = &stats.label_stats["LIKES"];
assert_eq!(likes.edge_count, 1);
}
#[test]
fn degree_histogram_values() {
let mut csr = CsrIndex::new();
csr.add_edge("a", "L", "b").unwrap();
csr.add_edge("a", "L", "c").unwrap();
csr.add_edge("a", "L", "d").unwrap();
csr.add_edge("b", "L", "c").unwrap();
csr.compact();
let stats = csr.compute_statistics();
assert_eq!(stats.out_degree_histogram.min, 0);
assert_eq!(stats.out_degree_histogram.max, 3);
assert!(stats.out_degree_histogram.avg > 0.0);
}
#[test]
fn label_edge_count_direct() {
let mut csr = CsrIndex::new();
csr.add_edge("a", "KNOWS", "b").unwrap();
csr.add_edge("b", "KNOWS", "c").unwrap();
csr.add_edge("a", "LIKES", "c").unwrap();
csr.compact();
assert_eq!(csr.label_edge_count("KNOWS"), 2);
assert_eq!(csr.label_edge_count("LIKES"), 1);
assert_eq!(csr.label_edge_count("NONEXISTENT"), 0);
}
#[test]
fn label_selectivity_values() {
let mut csr = CsrIndex::new();
csr.add_edge("a", "KNOWS", "b").unwrap();
csr.add_edge("b", "KNOWS", "c").unwrap();
csr.add_edge("a", "LIKES", "c").unwrap();
csr.compact();
let sel_knows = csr.label_selectivity("KNOWS");
let sel_likes = csr.label_selectivity("LIKES");
assert!((sel_knows - 2.0 / 3.0).abs() < 1e-9);
assert!((sel_likes - 1.0 / 3.0).abs() < 1e-9);
assert_eq!(csr.label_selectivity("NONEXISTENT"), 1.0);
}
#[test]
fn statistics_serde_roundtrip() {
let mut csr = CsrIndex::new();
csr.add_edge("a", "KNOWS", "b").unwrap();
csr.compact();
let stats = csr.compute_statistics();
let json = sonic_rs::to_string(&stats).unwrap();
let parsed: GraphStatistics = sonic_rs::from_str(&json).unwrap();
assert_eq!(parsed.node_count, stats.node_count);
assert_eq!(parsed.edge_count, stats.edge_count);
}
#[test]
fn statistics_with_buffer_edges() {
let mut csr = CsrIndex::new();
csr.add_edge("a", "KNOWS", "b").unwrap();
let stats = csr.compute_statistics();
assert_eq!(stats.edge_count, 1);
assert_eq!(stats.label_stats["KNOWS"].edge_count, 1);
}
}