Skip to main content

tensorlogic_oxirs_bridge/
graph_stats.rs

1//! RDF graph statistics and structural analysis.
2//!
3//! Computes graph metrics from triple stores: node/edge counts,
4//! degree distributions, predicate frequencies, and density.
5
6use serde::{Deserialize, Serialize};
7use std::collections::{HashMap, HashSet};
8
9/// Comprehensive statistics about an RDF graph.
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct GraphStats {
12    /// Total number of unique subjects
13    pub subject_count: usize,
14    /// Total number of unique objects
15    pub object_count: usize,
16    /// Total number of unique nodes (subjects union objects)
17    pub node_count: usize,
18    /// Total number of triples (edges)
19    pub edge_count: usize,
20    /// Number of unique predicates
21    pub predicate_count: usize,
22    /// Graph density: edges / (nodes * (nodes-1)) for directed graphs
23    pub density: f64,
24    /// Average out-degree (edges per subject)
25    pub avg_out_degree: f64,
26    /// Maximum out-degree
27    pub max_out_degree: usize,
28    /// Average in-degree (edges per object)
29    pub avg_in_degree: f64,
30    /// Maximum in-degree
31    pub max_in_degree: usize,
32    /// Self-loop count (subject == object)
33    pub self_loop_count: usize,
34}
35
36impl GraphStats {
37    /// Compute stats from a list of (subject, predicate, object) triples.
38    pub fn compute(triples: &[(String, String, String)]) -> Self {
39        if triples.is_empty() {
40            return Self {
41                subject_count: 0,
42                object_count: 0,
43                node_count: 0,
44                edge_count: 0,
45                predicate_count: 0,
46                density: 0.0,
47                avg_out_degree: 0.0,
48                max_out_degree: 0,
49                avg_in_degree: 0.0,
50                max_in_degree: 0,
51                self_loop_count: 0,
52            };
53        }
54
55        let mut subjects: HashSet<&str> = HashSet::new();
56        let mut objects: HashSet<&str> = HashSet::new();
57        let mut predicates: HashSet<&str> = HashSet::new();
58        let mut out_degree: HashMap<&str, usize> = HashMap::new();
59        let mut in_degree: HashMap<&str, usize> = HashMap::new();
60        let mut self_loop_count: usize = 0;
61
62        for (s, p, o) in triples {
63            subjects.insert(s.as_str());
64            objects.insert(o.as_str());
65            predicates.insert(p.as_str());
66
67            *out_degree.entry(s.as_str()).or_insert(0) += 1;
68            *in_degree.entry(o.as_str()).or_insert(0) += 1;
69
70            if s == o {
71                self_loop_count += 1;
72            }
73        }
74
75        let subject_count = subjects.len();
76        let object_count = objects.len();
77        let nodes: HashSet<&str> = subjects.union(&objects).copied().collect();
78        let node_count = nodes.len();
79        let edge_count = triples.len();
80        let predicate_count = predicates.len();
81
82        let density = if node_count > 1 {
83            edge_count as f64 / (node_count as f64 * (node_count as f64 - 1.0))
84        } else {
85            0.0
86        };
87
88        let avg_out_degree = if subject_count > 0 {
89            edge_count as f64 / subject_count as f64
90        } else {
91            0.0
92        };
93
94        let max_out_degree = out_degree.values().copied().max().unwrap_or(0);
95
96        let avg_in_degree = if object_count > 0 {
97            edge_count as f64 / object_count as f64
98        } else {
99            0.0
100        };
101
102        let max_in_degree = in_degree.values().copied().max().unwrap_or(0);
103
104        Self {
105            subject_count,
106            object_count,
107            node_count,
108            edge_count,
109            predicate_count,
110            density,
111            avg_out_degree,
112            max_out_degree,
113            avg_in_degree,
114            max_in_degree,
115            self_loop_count,
116        }
117    }
118
119    /// Is the graph empty?
120    pub fn is_empty(&self) -> bool {
121        self.edge_count == 0
122    }
123
124    /// Human-readable summary.
125    pub fn summary(&self) -> String {
126        format!(
127            "Graph: {} nodes, {} edges, {} predicates | \
128             density={:.6} | out-degree: avg={:.2}, max={} | \
129             in-degree: avg={:.2}, max={} | self-loops: {}",
130            self.node_count,
131            self.edge_count,
132            self.predicate_count,
133            self.density,
134            self.avg_out_degree,
135            self.max_out_degree,
136            self.avg_in_degree,
137            self.max_in_degree,
138            self.self_loop_count,
139        )
140    }
141}
142
143/// Per-predicate statistics.
144#[derive(Debug, Clone, Serialize, Deserialize)]
145pub struct PredicateStats {
146    /// Predicate name (IRI)
147    pub name: String,
148    /// Number of triples using this predicate
149    pub count: usize,
150    /// Number of unique subjects for this predicate
151    pub unique_subjects: usize,
152    /// Number of unique objects for this predicate
153    pub unique_objects: usize,
154    /// Whether each subject maps to at most one object (functional property)
155    pub is_functional: bool,
156    /// Whether each object maps to at most one subject (inverse functional property)
157    pub is_inverse_functional: bool,
158}
159
160/// Compute per-predicate statistics from triples.
161pub fn predicate_statistics(triples: &[(String, String, String)]) -> Vec<PredicateStats> {
162    let mut pred_data: HashMap<&str, Vec<(&str, &str)>> = HashMap::new();
163
164    for (s, p, o) in triples {
165        pred_data
166            .entry(p.as_str())
167            .or_default()
168            .push((s.as_str(), o.as_str()));
169    }
170
171    let mut results: Vec<PredicateStats> = pred_data
172        .into_iter()
173        .map(|(name, pairs)| {
174            let count = pairs.len();
175
176            let mut subj_set: HashSet<&str> = HashSet::new();
177            let mut obj_set: HashSet<&str> = HashSet::new();
178            let mut subj_to_objs: HashMap<&str, HashSet<&str>> = HashMap::new();
179            let mut obj_to_subjs: HashMap<&str, HashSet<&str>> = HashMap::new();
180
181            for &(s, o) in &pairs {
182                subj_set.insert(s);
183                obj_set.insert(o);
184                subj_to_objs.entry(s).or_default().insert(o);
185                obj_to_subjs.entry(o).or_default().insert(s);
186            }
187
188            let is_functional = subj_to_objs.values().all(|objs| objs.len() <= 1);
189            let is_inverse_functional = obj_to_subjs.values().all(|subjs| subjs.len() <= 1);
190
191            PredicateStats {
192                name: name.to_string(),
193                count,
194                unique_subjects: subj_set.len(),
195                unique_objects: obj_set.len(),
196                is_functional,
197                is_inverse_functional,
198            }
199        })
200        .collect();
201
202    results.sort_by(|a, b| b.count.cmp(&a.count).then_with(|| a.name.cmp(&b.name)));
203    results
204}
205
206/// Degree distribution: maps degree to count of nodes with that degree.
207#[derive(Debug, Clone, Default, Serialize, Deserialize)]
208pub struct DegreeDistribution {
209    /// Out-degree distribution: degree -> number of nodes
210    pub out_degrees: HashMap<usize, usize>,
211    /// In-degree distribution: degree -> number of nodes
212    pub in_degrees: HashMap<usize, usize>,
213}
214
215impl DegreeDistribution {
216    /// Compute degree distributions from triples.
217    pub fn compute(triples: &[(String, String, String)]) -> Self {
218        let mut out_deg: HashMap<&str, usize> = HashMap::new();
219        let mut in_deg: HashMap<&str, usize> = HashMap::new();
220
221        for (s, _p, o) in triples {
222            *out_deg.entry(s.as_str()).or_insert(0) += 1;
223            *in_deg.entry(o.as_str()).or_insert(0) += 1;
224        }
225
226        let mut out_degrees: HashMap<usize, usize> = HashMap::new();
227        for &deg in out_deg.values() {
228            *out_degrees.entry(deg).or_insert(0) += 1;
229        }
230
231        let mut in_degrees: HashMap<usize, usize> = HashMap::new();
232        for &deg in in_deg.values() {
233            *in_degrees.entry(deg).or_insert(0) += 1;
234        }
235
236        Self {
237            out_degrees,
238            in_degrees,
239        }
240    }
241
242    /// Median out-degree.
243    pub fn median_out_degree(&self) -> f64 {
244        compute_median_from_distribution(&self.out_degrees)
245    }
246
247    /// Median in-degree.
248    pub fn median_in_degree(&self) -> f64 {
249        compute_median_from_distribution(&self.in_degrees)
250    }
251}
252
253/// Compute median from a frequency distribution (degree -> count).
254fn compute_median_from_distribution(dist: &HashMap<usize, usize>) -> f64 {
255    if dist.is_empty() {
256        return 0.0;
257    }
258
259    let total: usize = dist.values().sum();
260    if total == 0 {
261        return 0.0;
262    }
263
264    let mut sorted_degrees: Vec<usize> = dist.keys().copied().collect();
265    sorted_degrees.sort_unstable();
266
267    // Expand into sorted values to find median
268    let mut values: Vec<usize> = Vec::with_capacity(total);
269    for &deg in &sorted_degrees {
270        let count = dist.get(&deg).copied().unwrap_or(0);
271        for _ in 0..count {
272            values.push(deg);
273        }
274    }
275
276    if values.len() % 2 == 1 {
277        values[values.len() / 2] as f64
278    } else {
279        let mid = values.len() / 2;
280        (values[mid - 1] as f64 + values[mid] as f64) / 2.0
281    }
282}
283
284/// Compute the number of weakly connected components using union-find.
285pub fn connected_components(triples: &[(String, String, String)]) -> usize {
286    let mut nodes: HashSet<String> = HashSet::new();
287    let mut uf = UnionFind::new();
288
289    for (s, _p, o) in triples {
290        nodes.insert(s.clone());
291        nodes.insert(o.clone());
292        uf.union(s, o);
293    }
294
295    if nodes.is_empty() {
296        return 0;
297    }
298
299    uf.component_count(&nodes)
300}
301
302/// Simple union-find (disjoint set) structure for graph connectivity.
303struct UnionFind {
304    parent: HashMap<String, String>,
305}
306
307impl UnionFind {
308    fn new() -> Self {
309        Self {
310            parent: HashMap::new(),
311        }
312    }
313
314    fn find(&mut self, x: &str) -> String {
315        // Ensure node exists
316        if !self.parent.contains_key(x) {
317            self.parent.insert(x.to_string(), x.to_string());
318            return x.to_string();
319        }
320
321        // Path compression via iterative approach
322        let mut current = x.to_string();
323        let mut path = Vec::new();
324
325        loop {
326            let p = self
327                .parent
328                .get(&current)
329                .cloned()
330                .unwrap_or_else(|| current.clone());
331            if p == current {
332                break;
333            }
334            path.push(current.clone());
335            current = p;
336        }
337
338        // Compress path
339        let root = current;
340        for node in path {
341            self.parent.insert(node, root.clone());
342        }
343
344        root
345    }
346
347    fn union(&mut self, x: &str, y: &str) {
348        let rx = self.find(x);
349        let ry = self.find(y);
350        if rx != ry {
351            self.parent.insert(rx, ry);
352        }
353    }
354
355    fn component_count(&mut self, nodes: &HashSet<String>) -> usize {
356        let mut roots: HashSet<String> = HashSet::new();
357        for node in nodes {
358            let root = self.find(node);
359            roots.insert(root);
360        }
361        roots.len()
362    }
363}
364
365#[cfg(test)]
366mod tests {
367    use super::*;
368
369    fn make_triples(data: &[(&str, &str, &str)]) -> Vec<(String, String, String)> {
370        data.iter()
371            .map(|(s, p, o)| (s.to_string(), p.to_string(), o.to_string()))
372            .collect()
373    }
374
375    #[test]
376    fn test_graph_stats_empty() {
377        let stats = GraphStats::compute(&[]);
378        assert_eq!(stats.subject_count, 0);
379        assert_eq!(stats.object_count, 0);
380        assert_eq!(stats.node_count, 0);
381        assert_eq!(stats.edge_count, 0);
382        assert_eq!(stats.predicate_count, 0);
383        assert_eq!(stats.density, 0.0);
384        assert_eq!(stats.avg_out_degree, 0.0);
385        assert_eq!(stats.max_out_degree, 0);
386        assert_eq!(stats.avg_in_degree, 0.0);
387        assert_eq!(stats.max_in_degree, 0);
388        assert_eq!(stats.self_loop_count, 0);
389    }
390
391    #[test]
392    fn test_graph_stats_single_triple() {
393        let triples = make_triples(&[("a", "p", "b")]);
394        let stats = GraphStats::compute(&triples);
395        assert_eq!(stats.node_count, 2);
396        assert_eq!(stats.edge_count, 1);
397        assert_eq!(stats.subject_count, 1);
398        assert_eq!(stats.object_count, 1);
399    }
400
401    #[test]
402    fn test_graph_stats_node_count() {
403        // "a" appears as both subject and object, should be deduplicated
404        let triples = make_triples(&[("a", "p", "b"), ("b", "q", "a")]);
405        let stats = GraphStats::compute(&triples);
406        assert_eq!(stats.node_count, 2);
407        assert_eq!(stats.subject_count, 2);
408        assert_eq!(stats.object_count, 2);
409    }
410
411    #[test]
412    fn test_graph_stats_density() {
413        // 3 nodes, 3 edges => density = 3 / (3 * 2) = 0.5
414        let triples = make_triples(&[("a", "p", "b"), ("b", "p", "c"), ("c", "p", "a")]);
415        let stats = GraphStats::compute(&triples);
416        assert_eq!(stats.node_count, 3);
417        assert_eq!(stats.edge_count, 3);
418        let expected_density = 3.0 / (3.0 * 2.0);
419        assert!((stats.density - expected_density).abs() < 1e-10);
420    }
421
422    #[test]
423    fn test_graph_stats_degrees() {
424        // a->b, a->c, b->c
425        let triples = make_triples(&[("a", "p", "b"), ("a", "p", "c"), ("b", "p", "c")]);
426        let stats = GraphStats::compute(&triples);
427        // out-degrees: a=2, b=1 => avg = 3/2 = 1.5, max = 2
428        assert!((stats.avg_out_degree - 1.5).abs() < 1e-10);
429        assert_eq!(stats.max_out_degree, 2);
430        // in-degrees: b=1, c=2 => avg = 3/2 = 1.5, max = 2
431        assert!((stats.avg_in_degree - 1.5).abs() < 1e-10);
432        assert_eq!(stats.max_in_degree, 2);
433    }
434
435    #[test]
436    fn test_graph_stats_self_loop() {
437        let triples = make_triples(&[("a", "p", "a"), ("a", "q", "b")]);
438        let stats = GraphStats::compute(&triples);
439        assert_eq!(stats.self_loop_count, 1);
440    }
441
442    #[test]
443    fn test_graph_stats_summary() {
444        let triples = make_triples(&[("a", "p", "b")]);
445        let stats = GraphStats::compute(&triples);
446        let summary = stats.summary();
447        assert!(!summary.is_empty());
448        assert!(summary.contains("2 nodes"));
449        assert!(summary.contains("1 edges"));
450    }
451
452    #[test]
453    fn test_graph_stats_is_empty() {
454        let stats = GraphStats::compute(&[]);
455        assert!(stats.is_empty());
456
457        let triples = make_triples(&[("a", "p", "b")]);
458        let stats2 = GraphStats::compute(&triples);
459        assert!(!stats2.is_empty());
460    }
461
462    #[test]
463    fn test_predicate_stats_count() {
464        let triples = make_triples(&[
465            ("a", "knows", "b"),
466            ("b", "knows", "c"),
467            ("a", "likes", "c"),
468        ]);
469        let stats = predicate_statistics(&triples);
470        assert_eq!(stats.len(), 2);
471        // sorted by count desc: knows=2, likes=1
472        assert_eq!(stats[0].name, "knows");
473        assert_eq!(stats[0].count, 2);
474        assert_eq!(stats[1].name, "likes");
475        assert_eq!(stats[1].count, 1);
476    }
477
478    #[test]
479    fn test_predicate_stats_functional() {
480        // Each subject has exactly one object for "name"
481        let triples = make_triples(&[("a", "name", "Alice"), ("b", "name", "Bob")]);
482        let stats = predicate_statistics(&triples);
483        assert_eq!(stats.len(), 1);
484        assert!(stats[0].is_functional);
485    }
486
487    #[test]
488    fn test_predicate_stats_not_functional() {
489        // Subject "a" has two objects for "knows"
490        let triples = make_triples(&[("a", "knows", "b"), ("a", "knows", "c")]);
491        let stats = predicate_statistics(&triples);
492        assert_eq!(stats.len(), 1);
493        assert!(!stats[0].is_functional);
494    }
495
496    #[test]
497    fn test_predicate_stats_inverse_functional() {
498        // Each object has exactly one subject
499        let triples = make_triples(&[("a", "id", "1"), ("b", "id", "2")]);
500        let stats = predicate_statistics(&triples);
501        assert_eq!(stats.len(), 1);
502        assert!(stats[0].is_inverse_functional);
503    }
504
505    #[test]
506    fn test_degree_distribution_compute() {
507        // a->b, a->c, b->c
508        let triples = make_triples(&[("a", "p", "b"), ("a", "p", "c"), ("b", "p", "c")]);
509        let dist = DegreeDistribution::compute(&triples);
510        // out: a=2, b=1 => {2: 1, 1: 1}
511        assert_eq!(dist.out_degrees.get(&2), Some(&1));
512        assert_eq!(dist.out_degrees.get(&1), Some(&1));
513        // in: b=1, c=2 => {1: 1, 2: 1}
514        assert_eq!(dist.in_degrees.get(&1), Some(&1));
515        assert_eq!(dist.in_degrees.get(&2), Some(&1));
516    }
517
518    #[test]
519    fn test_degree_distribution_median() {
520        // 4 nodes with out-degrees: 1, 1, 2, 3 => median = (1+2)/2 = 1.5
521        let triples = make_triples(&[
522            ("a", "p", "x"),
523            ("b", "p", "x"),
524            ("c", "p", "x"),
525            ("c", "p", "y"),
526            ("d", "p", "x"),
527            ("d", "p", "y"),
528            ("d", "p", "z"),
529        ]);
530        let dist = DegreeDistribution::compute(&triples);
531        // out-degrees: a=1, b=1, c=2, d=3 => sorted: [1,1,2,3] => median=(1+2)/2=1.5
532        assert!((dist.median_out_degree() - 1.5).abs() < 1e-10);
533    }
534
535    #[test]
536    fn test_connected_components_single() {
537        // Fully connected triangle => 1 component
538        let triples = make_triples(&[("a", "p", "b"), ("b", "p", "c"), ("c", "p", "a")]);
539        assert_eq!(connected_components(&triples), 1);
540    }
541
542    #[test]
543    fn test_connected_components_two() {
544        // Two disconnected edges => 2 components
545        let triples = make_triples(&[("a", "p", "b"), ("c", "p", "d")]);
546        assert_eq!(connected_components(&triples), 2);
547    }
548
549    #[test]
550    fn test_connected_components_isolated() {
551        // a-b connected, c-d connected, but c also connects to e
552        // Actually: make truly isolated by having separate components
553        let triples = make_triples(&[("a", "p", "b"), ("c", "p", "d"), ("e", "p", "f")]);
554        assert_eq!(connected_components(&triples), 3);
555    }
556
557    #[test]
558    fn test_graph_stats_multiple_predicates() {
559        let triples = make_triples(&[
560            ("a", "knows", "b"),
561            ("a", "likes", "c"),
562            ("b", "hates", "c"),
563        ]);
564        let stats = GraphStats::compute(&triples);
565        assert_eq!(stats.predicate_count, 3);
566    }
567}