Skip to main content

nodedb_graph/csr/
statistics.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Edge table statistics for query planning and optimization.
4//!
5//! Maintains per-label edge counts and degree distribution histograms.
6//! Updated incrementally on `add_edge`/`remove_edge` and fully recomputed
7//! on `compact`. Used by the MATCH pattern compiler for join order
8//! optimization (most selective label first).
9
10use std::collections::HashMap;
11use std::mem::size_of;
12
13use nodedb_mem::EngineId;
14use serde::{Deserialize, Serialize};
15
16use super::CsrIndex;
17use crate::GraphError;
18
19/// Per-label edge statistics.
20#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct LabelStats {
22    /// Number of edges with this label.
23    pub edge_count: usize,
24    /// Number of distinct source nodes that have this label on an outbound edge.
25    pub distinct_sources: usize,
26    /// Number of distinct destination nodes that have this label on an inbound edge.
27    pub distinct_targets: usize,
28}
29
30/// Degree distribution histogram.
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct DegreeHistogram {
33    pub min: usize,
34    pub max: usize,
35    pub avg: f64,
36    pub p50: usize,
37    pub p95: usize,
38    pub p99: usize,
39}
40
41/// Complete graph statistics snapshot.
42#[derive(Debug, Clone, Serialize, Deserialize)]
43pub struct GraphStatistics {
44    /// Total number of nodes.
45    pub node_count: usize,
46    /// Total number of edges (all labels).
47    pub edge_count: usize,
48    /// Number of distinct edge labels.
49    pub label_count: usize,
50    /// Per-label statistics.
51    pub label_stats: HashMap<String, LabelStats>,
52    /// Out-degree distribution across all nodes.
53    pub out_degree_histogram: DegreeHistogram,
54    /// In-degree distribution across all nodes.
55    pub in_degree_histogram: DegreeHistogram,
56}
57
58impl CsrIndex {
59    /// Compute full graph statistics from the current CSR state.
60    ///
61    /// O(V + E) — iterates all nodes and edges once. Intended for query
62    /// planning, not hot-path execution.
63    ///
64    /// # Errors
65    ///
66    /// Returns [`GraphError::MemoryBudget`] if a memory governor is installed
67    /// and the degree-array working set would exceed the `Graph` engine budget.
68    pub fn compute_statistics(&self) -> Result<GraphStatistics, GraphError> {
69        let n = self.node_count();
70        if n == 0 {
71            return Ok(GraphStatistics {
72                node_count: 0,
73                edge_count: 0,
74                label_count: 0,
75                label_stats: HashMap::new(),
76                out_degree_histogram: DegreeHistogram {
77                    min: 0,
78                    max: 0,
79                    avg: 0.0,
80                    p50: 0,
81                    p95: 0,
82                    p99: 0,
83                },
84                in_degree_histogram: DegreeHistogram {
85                    min: 0,
86                    max: 0,
87                    avg: 0.0,
88                    p50: 0,
89                    p95: 0,
90                    p99: 0,
91                },
92            });
93        }
94
95        // Reserve memory for the two degree-distribution scratch arrays.
96        let degree_bytes = 2 * n * size_of::<usize>();
97        let _degree_guard = self
98            .governor
99            .as_ref()
100            .map(|g| g.reserve(EngineId::Graph, degree_bytes))
101            .transpose()?;
102
103        // Per-label counters.
104        let mut label_edge_count: HashMap<u32, usize> = HashMap::new();
105        let mut label_sources: HashMap<u32, std::collections::HashSet<u32>> = HashMap::new();
106        let mut label_targets: HashMap<u32, std::collections::HashSet<u32>> = HashMap::new();
107
108        // Degree arrays.
109        // no-governor: cold statistics scan; degree arrays parallel to node count, governed at stats call site
110        let mut out_degrees: Vec<usize> = Vec::with_capacity(n);
111        // no-governor: cold statistics scan; degree arrays parallel to node count, governed at stats call site
112        let mut in_degrees: Vec<usize> = Vec::with_capacity(n);
113
114        let mut total_edges = 0usize;
115
116        for node in 0..n {
117            let node_id = node as u32;
118            let mut out_deg = 0usize;
119            let mut in_deg = 0usize;
120
121            for (lid, dst) in self.dense_iter_out(node_id) {
122                out_deg += 1;
123                total_edges += 1;
124                *label_edge_count.entry(lid).or_insert(0) += 1;
125                label_sources.entry(lid).or_default().insert(node_id);
126                label_targets.entry(lid).or_default().insert(dst);
127            }
128
129            for (_lid, _src) in self.dense_iter_in(node_id) {
130                in_deg += 1;
131            }
132
133            out_degrees.push(out_deg);
134            in_degrees.push(in_deg);
135        }
136
137        // Build per-label stats.
138        let mut label_stats = HashMap::new();
139        for (&lid, &count) in &label_edge_count {
140            let label_name = self.label_name(lid).to_string();
141            label_stats.insert(
142                label_name,
143                LabelStats {
144                    edge_count: count,
145                    distinct_sources: label_sources.get(&lid).map_or(0, |s| s.len()),
146                    distinct_targets: label_targets.get(&lid).map_or(0, |s| s.len()),
147                },
148            );
149        }
150
151        Ok(GraphStatistics {
152            node_count: n,
153            edge_count: total_edges,
154            label_count: label_edge_count.len(),
155            label_stats,
156            out_degree_histogram: compute_histogram(&out_degrees),
157            in_degree_histogram: compute_histogram(&in_degrees),
158        })
159    }
160
161    /// Get the edge count for a specific label. O(E) unless cached.
162    ///
163    /// Returns 0 if the label doesn't exist.
164    pub fn label_edge_count(&self, label: &str) -> usize {
165        let Some(lid) = self.label_id(label) else {
166            return 0;
167        };
168
169        let n = self.node_count();
170        let mut count = 0usize;
171        for node in 0..n {
172            for (l, _dst) in self.dense_iter_out(node as u32) {
173                if l == lid {
174                    count += 1;
175                }
176            }
177        }
178        count
179    }
180
181    /// Estimate the selectivity of a label: edge_count / total_edges.
182    ///
183    /// Returns 1.0 for unknown labels (conservative — assume all edges).
184    /// Returns 0.0 for graphs with no edges.
185    pub fn label_selectivity(&self, label: &str) -> f64 {
186        let total = self.edge_count();
187        if total == 0 {
188            return 0.0;
189        }
190        let count = self.label_edge_count(label);
191        if count == 0 {
192            return 1.0; // Unknown label → conservative estimate.
193        }
194        count as f64 / total as f64
195    }
196}
197
198/// Compute degree distribution histogram from a degree array.
199fn compute_histogram(degrees: &[usize]) -> DegreeHistogram {
200    if degrees.is_empty() {
201        return DegreeHistogram {
202            min: 0,
203            max: 0,
204            avg: 0.0,
205            p50: 0,
206            p95: 0,
207            p99: 0,
208        };
209    }
210
211    let mut sorted = degrees.to_vec();
212    sorted.sort_unstable();
213
214    let n = sorted.len();
215    let sum: usize = sorted.iter().sum();
216
217    DegreeHistogram {
218        min: sorted[0],
219        max: sorted[n - 1],
220        avg: sum as f64 / n as f64,
221        p50: sorted[n / 2],
222        p95: sorted[(n as f64 * 0.95) as usize],
223        p99: sorted[((n as f64 * 0.99) as usize).min(n - 1)],
224    }
225}
226
227#[cfg(test)]
228mod tests {
229    use super::*;
230
231    #[test]
232    fn statistics_empty_graph() {
233        let csr = CsrIndex::new();
234        let stats = csr.compute_statistics().unwrap();
235        assert_eq!(stats.node_count, 0);
236        assert_eq!(stats.edge_count, 0);
237        assert_eq!(stats.label_count, 0);
238    }
239
240    #[test]
241    fn statistics_basic() {
242        let mut csr = CsrIndex::new();
243        csr.add_edge("a", "KNOWS", "b").unwrap();
244        csr.add_edge("b", "KNOWS", "c").unwrap();
245        csr.add_edge("a", "LIKES", "c").unwrap();
246        csr.compact().expect("no governor, cannot fail");
247
248        let stats = csr.compute_statistics().unwrap();
249        assert_eq!(stats.node_count, 3);
250        assert_eq!(stats.edge_count, 3);
251        assert_eq!(stats.label_count, 2);
252
253        let knows = &stats.label_stats["KNOWS"];
254        assert_eq!(knows.edge_count, 2);
255        assert_eq!(knows.distinct_sources, 2);
256        assert_eq!(knows.distinct_targets, 2);
257
258        let likes = &stats.label_stats["LIKES"];
259        assert_eq!(likes.edge_count, 1);
260    }
261
262    #[test]
263    fn degree_histogram_values() {
264        let mut csr = CsrIndex::new();
265        csr.add_edge("a", "L", "b").unwrap();
266        csr.add_edge("a", "L", "c").unwrap();
267        csr.add_edge("a", "L", "d").unwrap();
268        csr.add_edge("b", "L", "c").unwrap();
269        csr.compact().expect("no governor, cannot fail");
270
271        let stats = csr.compute_statistics().unwrap();
272        assert_eq!(stats.out_degree_histogram.min, 0);
273        assert_eq!(stats.out_degree_histogram.max, 3);
274        assert!(stats.out_degree_histogram.avg > 0.0);
275    }
276
277    #[test]
278    fn label_edge_count_direct() {
279        let mut csr = CsrIndex::new();
280        csr.add_edge("a", "KNOWS", "b").unwrap();
281        csr.add_edge("b", "KNOWS", "c").unwrap();
282        csr.add_edge("a", "LIKES", "c").unwrap();
283        csr.compact().expect("no governor, cannot fail");
284
285        assert_eq!(csr.label_edge_count("KNOWS"), 2);
286        assert_eq!(csr.label_edge_count("LIKES"), 1);
287        assert_eq!(csr.label_edge_count("NONEXISTENT"), 0);
288    }
289
290    #[test]
291    fn label_selectivity_values() {
292        let mut csr = CsrIndex::new();
293        csr.add_edge("a", "KNOWS", "b").unwrap();
294        csr.add_edge("b", "KNOWS", "c").unwrap();
295        csr.add_edge("a", "LIKES", "c").unwrap();
296        csr.compact().expect("no governor, cannot fail");
297
298        let sel_knows = csr.label_selectivity("KNOWS");
299        let sel_likes = csr.label_selectivity("LIKES");
300
301        assert!((sel_knows - 2.0 / 3.0).abs() < 1e-9);
302        assert!((sel_likes - 1.0 / 3.0).abs() < 1e-9);
303        assert_eq!(csr.label_selectivity("NONEXISTENT"), 1.0);
304    }
305
306    #[test]
307    fn statistics_serde_roundtrip() {
308        let mut csr = CsrIndex::new();
309        csr.add_edge("a", "KNOWS", "b").unwrap();
310        csr.compact().expect("no governor, cannot fail");
311
312        let stats = csr.compute_statistics().unwrap();
313        let json = sonic_rs::to_string(&stats).unwrap();
314        let parsed: GraphStatistics = sonic_rs::from_str(&json).unwrap();
315        assert_eq!(parsed.node_count, stats.node_count);
316        assert_eq!(parsed.edge_count, stats.edge_count);
317    }
318
319    #[test]
320    fn statistics_with_buffer_edges() {
321        let mut csr = CsrIndex::new();
322        csr.add_edge("a", "KNOWS", "b").unwrap();
323        // Don't compact — edges in buffer.
324        let stats = csr.compute_statistics().unwrap();
325        assert_eq!(stats.edge_count, 1);
326        assert_eq!(stats.label_stats["KNOWS"].edge_count, 1);
327    }
328}