Skip to main content

nodedb_graph/csr/
statistics.rs

1//! Edge table statistics for query planning and optimization.
2//!
3//! Maintains per-label edge counts and degree distribution histograms.
4//! Updated incrementally on `add_edge`/`remove_edge` and fully recomputed
5//! on `compact`. Used by the MATCH pattern compiler for join order
6//! optimization (most selective label first).
7
8use std::collections::HashMap;
9
10use serde::{Deserialize, Serialize};
11
12use super::CsrIndex;
13
14/// Per-label edge statistics.
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct LabelStats {
17    /// Number of edges with this label.
18    pub edge_count: usize,
19    /// Number of distinct source nodes that have this label on an outbound edge.
20    pub distinct_sources: usize,
21    /// Number of distinct destination nodes that have this label on an inbound edge.
22    pub distinct_targets: usize,
23}
24
25/// Degree distribution histogram.
26#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct DegreeHistogram {
28    pub min: usize,
29    pub max: usize,
30    pub avg: f64,
31    pub p50: usize,
32    pub p95: usize,
33    pub p99: usize,
34}
35
36/// Complete graph statistics snapshot.
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct GraphStatistics {
39    /// Total number of nodes.
40    pub node_count: usize,
41    /// Total number of edges (all labels).
42    pub edge_count: usize,
43    /// Number of distinct edge labels.
44    pub label_count: usize,
45    /// Per-label statistics.
46    pub label_stats: HashMap<String, LabelStats>,
47    /// Out-degree distribution across all nodes.
48    pub out_degree_histogram: DegreeHistogram,
49    /// In-degree distribution across all nodes.
50    pub in_degree_histogram: DegreeHistogram,
51}
52
53impl CsrIndex {
54    /// Compute full graph statistics from the current CSR state.
55    ///
56    /// O(V + E) — iterates all nodes and edges once. Intended for query
57    /// planning, not hot-path execution.
58    pub fn compute_statistics(&self) -> GraphStatistics {
59        let n = self.node_count();
60        if n == 0 {
61            return GraphStatistics {
62                node_count: 0,
63                edge_count: 0,
64                label_count: 0,
65                label_stats: HashMap::new(),
66                out_degree_histogram: DegreeHistogram {
67                    min: 0,
68                    max: 0,
69                    avg: 0.0,
70                    p50: 0,
71                    p95: 0,
72                    p99: 0,
73                },
74                in_degree_histogram: DegreeHistogram {
75                    min: 0,
76                    max: 0,
77                    avg: 0.0,
78                    p50: 0,
79                    p95: 0,
80                    p99: 0,
81                },
82            };
83        }
84
85        // Per-label counters.
86        let mut label_edge_count: HashMap<u16, usize> = HashMap::new();
87        let mut label_sources: HashMap<u16, std::collections::HashSet<u32>> = HashMap::new();
88        let mut label_targets: HashMap<u16, std::collections::HashSet<u32>> = HashMap::new();
89
90        // Degree arrays.
91        let mut out_degrees: Vec<usize> = Vec::with_capacity(n);
92        let mut in_degrees: Vec<usize> = Vec::with_capacity(n);
93
94        let mut total_edges = 0usize;
95
96        for node in 0..n {
97            let node_id = node as u32;
98            let mut out_deg = 0usize;
99            let mut in_deg = 0usize;
100
101            for (lid, dst) in self.iter_out_edges(node_id) {
102                out_deg += 1;
103                total_edges += 1;
104                *label_edge_count.entry(lid).or_insert(0) += 1;
105                label_sources.entry(lid).or_default().insert(node_id);
106                label_targets.entry(lid).or_default().insert(dst);
107            }
108
109            for (_lid, _src) in self.iter_in_edges(node_id) {
110                in_deg += 1;
111            }
112
113            out_degrees.push(out_deg);
114            in_degrees.push(in_deg);
115        }
116
117        // Build per-label stats.
118        let mut label_stats = HashMap::new();
119        for (&lid, &count) in &label_edge_count {
120            let label_name = self.label_name(lid).to_string();
121            label_stats.insert(
122                label_name,
123                LabelStats {
124                    edge_count: count,
125                    distinct_sources: label_sources.get(&lid).map_or(0, |s| s.len()),
126                    distinct_targets: label_targets.get(&lid).map_or(0, |s| s.len()),
127                },
128            );
129        }
130
131        GraphStatistics {
132            node_count: n,
133            edge_count: total_edges,
134            label_count: label_edge_count.len(),
135            label_stats,
136            out_degree_histogram: compute_histogram(&out_degrees),
137            in_degree_histogram: compute_histogram(&in_degrees),
138        }
139    }
140
141    /// Get the edge count for a specific label. O(E) unless cached.
142    ///
143    /// Returns 0 if the label doesn't exist.
144    pub fn label_edge_count(&self, label: &str) -> usize {
145        let Some(lid) = self.label_id(label) else {
146            return 0;
147        };
148
149        let n = self.node_count();
150        let mut count = 0usize;
151        for node in 0..n {
152            for (l, _dst) in self.iter_out_edges(node as u32) {
153                if l == lid {
154                    count += 1;
155                }
156            }
157        }
158        count
159    }
160
161    /// Estimate the selectivity of a label: edge_count / total_edges.
162    ///
163    /// Returns 1.0 for unknown labels (conservative — assume all edges).
164    /// Returns 0.0 for graphs with no edges.
165    pub fn label_selectivity(&self, label: &str) -> f64 {
166        let total = self.edge_count();
167        if total == 0 {
168            return 0.0;
169        }
170        let count = self.label_edge_count(label);
171        if count == 0 {
172            return 1.0; // Unknown label → conservative estimate.
173        }
174        count as f64 / total as f64
175    }
176}
177
178/// Compute degree distribution histogram from a degree array.
179fn compute_histogram(degrees: &[usize]) -> DegreeHistogram {
180    if degrees.is_empty() {
181        return DegreeHistogram {
182            min: 0,
183            max: 0,
184            avg: 0.0,
185            p50: 0,
186            p95: 0,
187            p99: 0,
188        };
189    }
190
191    let mut sorted = degrees.to_vec();
192    sorted.sort_unstable();
193
194    let n = sorted.len();
195    let sum: usize = sorted.iter().sum();
196
197    DegreeHistogram {
198        min: sorted[0],
199        max: sorted[n - 1],
200        avg: sum as f64 / n as f64,
201        p50: sorted[n / 2],
202        p95: sorted[(n as f64 * 0.95) as usize],
203        p99: sorted[((n as f64 * 0.99) as usize).min(n - 1)],
204    }
205}
206
207#[cfg(test)]
208mod tests {
209    use super::*;
210
211    #[test]
212    fn statistics_empty_graph() {
213        let csr = CsrIndex::new();
214        let stats = csr.compute_statistics();
215        assert_eq!(stats.node_count, 0);
216        assert_eq!(stats.edge_count, 0);
217        assert_eq!(stats.label_count, 0);
218    }
219
220    #[test]
221    fn statistics_basic() {
222        let mut csr = CsrIndex::new();
223        csr.add_edge("a", "KNOWS", "b");
224        csr.add_edge("b", "KNOWS", "c");
225        csr.add_edge("a", "LIKES", "c");
226        csr.compact();
227
228        let stats = csr.compute_statistics();
229        assert_eq!(stats.node_count, 3);
230        assert_eq!(stats.edge_count, 3);
231        assert_eq!(stats.label_count, 2);
232
233        let knows = &stats.label_stats["KNOWS"];
234        assert_eq!(knows.edge_count, 2);
235        assert_eq!(knows.distinct_sources, 2);
236        assert_eq!(knows.distinct_targets, 2);
237
238        let likes = &stats.label_stats["LIKES"];
239        assert_eq!(likes.edge_count, 1);
240    }
241
242    #[test]
243    fn degree_histogram_values() {
244        let mut csr = CsrIndex::new();
245        csr.add_edge("a", "L", "b");
246        csr.add_edge("a", "L", "c");
247        csr.add_edge("a", "L", "d");
248        csr.add_edge("b", "L", "c");
249        csr.compact();
250
251        let stats = csr.compute_statistics();
252        assert_eq!(stats.out_degree_histogram.min, 0);
253        assert_eq!(stats.out_degree_histogram.max, 3);
254        assert!(stats.out_degree_histogram.avg > 0.0);
255    }
256
257    #[test]
258    fn label_edge_count_direct() {
259        let mut csr = CsrIndex::new();
260        csr.add_edge("a", "KNOWS", "b");
261        csr.add_edge("b", "KNOWS", "c");
262        csr.add_edge("a", "LIKES", "c");
263        csr.compact();
264
265        assert_eq!(csr.label_edge_count("KNOWS"), 2);
266        assert_eq!(csr.label_edge_count("LIKES"), 1);
267        assert_eq!(csr.label_edge_count("NONEXISTENT"), 0);
268    }
269
270    #[test]
271    fn label_selectivity_values() {
272        let mut csr = CsrIndex::new();
273        csr.add_edge("a", "KNOWS", "b");
274        csr.add_edge("b", "KNOWS", "c");
275        csr.add_edge("a", "LIKES", "c");
276        csr.compact();
277
278        let sel_knows = csr.label_selectivity("KNOWS");
279        let sel_likes = csr.label_selectivity("LIKES");
280
281        assert!((sel_knows - 2.0 / 3.0).abs() < 1e-9);
282        assert!((sel_likes - 1.0 / 3.0).abs() < 1e-9);
283        assert_eq!(csr.label_selectivity("NONEXISTENT"), 1.0);
284    }
285
286    #[test]
287    fn statistics_serde_roundtrip() {
288        let mut csr = CsrIndex::new();
289        csr.add_edge("a", "KNOWS", "b");
290        csr.compact();
291
292        let stats = csr.compute_statistics();
293        let json = sonic_rs::to_string(&stats).unwrap();
294        let parsed: GraphStatistics = sonic_rs::from_str(&json).unwrap();
295        assert_eq!(parsed.node_count, stats.node_count);
296        assert_eq!(parsed.edge_count, stats.edge_count);
297    }
298
299    #[test]
300    fn statistics_with_buffer_edges() {
301        let mut csr = CsrIndex::new();
302        csr.add_edge("a", "KNOWS", "b");
303        // Don't compact — edges in buffer.
304        let stats = csr.compute_statistics();
305        assert_eq!(stats.edge_count, 1);
306        assert_eq!(stats.label_stats["KNOWS"].edge_count, 1);
307    }
308}