oxirs-arq 0.2.4

Jena-style SPARQL algebra with extension points and query optimization
Documentation
//! Community detection: Louvain method (modularity optimisation)

use super::adapter::{NodeId, RdfGraphAdapter};
use std::collections::HashMap;

/// Community detection using the Louvain modularity-optimisation method.
///
/// The implementation follows the two-phase Louvain algorithm:
///   1. Local modularity optimisation (move each node to the best-gain community).
///   2. Graph aggregation (merge each community into a super-node).
///
/// The loop repeats until no improvement is achieved.
pub struct LouvainCommunities {
    /// Resolution parameter γ (default `1.0`).  Higher values produce more,
    /// smaller communities.
    pub resolution: f64,
    /// Maximum outer loop iterations (default `10`).
    pub max_iter: usize,
}

impl Default for LouvainCommunities {
    fn default() -> Self {
        Self::new()
    }
}

impl LouvainCommunities {
    /// Create with default parameters.
    pub fn new() -> Self {
        Self {
            resolution: 1.0,
            max_iter: 10,
        }
    }

    /// Detect communities.  Returns a map from `NodeId` to community label.
    pub fn detect(&self, graph: &RdfGraphAdapter) -> HashMap<NodeId, usize> {
        let n = graph.node_count();
        if n == 0 {
            return HashMap::new();
        }

        // Start: each node in its own community
        let mut community: Vec<usize> = (0..n).collect();

        // Total weight of all edges (×2 for undirected interpretation)
        let m2: f64 = (0..n)
            .flat_map(|u| graph.adjacency[u].iter().map(|(_, w)| *w))
            .sum::<f64>()
            * 2.0;
        let m2 = if m2 == 0.0 { 1.0 } else { m2 }; // avoid divide-by-zero

        // Degree of each node (sum of weights, undirected: in + out)
        let degree: Vec<f64> = (0..n)
            .map(|u| {
                graph.adjacency[u].iter().map(|(_, w)| *w).sum::<f64>()
                    + graph.reverse_adjacency[u]
                        .iter()
                        .map(|(_, w)| *w)
                        .sum::<f64>()
            })
            .collect();

        for _pass in 0..self.max_iter {
            let mut improved = false;

            for u in 0..n {
                let current_c = community[u];

                // Compute gain for each neighbouring community
                let mut neighbor_weight: HashMap<usize, f64> = HashMap::new();
                for &(v, w) in &graph.adjacency[u] {
                    *neighbor_weight.entry(community[v]).or_insert(0.0) += w;
                }
                for &(v, w) in &graph.reverse_adjacency[u] {
                    *neighbor_weight.entry(community[v]).or_insert(0.0) += w;
                }

                // Total degree of current community (excluding u)
                let sigma_c: f64 = (0..n)
                    .filter(|&v| community[v] == current_c && v != u)
                    .map(|v| degree[v])
                    .sum();

                let k_u = degree[u];
                let k_u_c = neighbor_weight.get(&current_c).copied().unwrap_or(0.0);

                // ΔQ for removing u from current community
                let remove_gain = k_u_c - self.resolution * sigma_c * k_u / m2;

                // Find best neighbouring community
                let mut best_c = current_c;
                let mut best_gain = 0.0;

                for (&c, &k_u_nc) in &neighbor_weight {
                    if c == current_c {
                        continue;
                    }
                    let sigma_nc: f64 = (0..n)
                        .filter(|&v| community[v] == c)
                        .map(|v| degree[v])
                        .sum();
                    let gain = k_u_nc - remove_gain - self.resolution * sigma_nc * k_u / m2;
                    if gain > best_gain {
                        best_gain = gain;
                        best_c = c;
                    }
                }

                if best_c != current_c {
                    community[u] = best_c;
                    improved = true;
                }
            }

            if !improved {
                break;
            }
        }

        // Re-label communities as contiguous integers
        let mut label_map: HashMap<usize, usize> = HashMap::new();
        let mut next_label = 0usize;
        for &c in &community {
            if let std::collections::hash_map::Entry::Vacant(e) = label_map.entry(c) {
                e.insert(next_label);
                next_label += 1;
            }
        }

        (0..n)
            .map(|u| (u, *label_map.get(&community[u]).unwrap_or(&0)))
            .collect()
    }

    /// Return a list of communities, each community being a list of `NodeId`s.
    pub fn communities(&self, graph: &RdfGraphAdapter) -> Vec<Vec<NodeId>> {
        let partition = self.detect(graph);
        let num_communities = partition
            .values()
            .copied()
            .max()
            .map(|m| m + 1)
            .unwrap_or(0);
        let mut result: Vec<Vec<NodeId>> = vec![Vec::new(); num_communities];
        for (node, comm) in &partition {
            if let Some(bucket) = result.get_mut(*comm) {
                bucket.push(*node);
            }
        }
        result
    }

    /// Compute the modularity Q of a given partition.
    ///
    /// Q = (1/2m) Σ_{ij} [A_ij - γ * k_i * k_j / 2m] * δ(c_i, c_j)
    pub fn modularity(&self, graph: &RdfGraphAdapter, partition: &HashMap<NodeId, usize>) -> f64 {
        let n = graph.node_count();
        let m2: f64 = (0..n)
            .flat_map(|u| graph.adjacency[u].iter().map(|(_, w)| *w))
            .sum::<f64>()
            * 2.0;
        if m2 == 0.0 {
            return 0.0;
        }

        let degree: Vec<f64> = (0..n)
            .map(|u| {
                graph.adjacency[u].iter().map(|(_, w)| *w).sum::<f64>()
                    + graph.reverse_adjacency[u]
                        .iter()
                        .map(|(_, w)| *w)
                        .sum::<f64>()
            })
            .collect();

        let mut q = 0.0f64;
        for u in 0..n {
            for &(v, w) in &graph.adjacency[u] {
                if partition.get(&u) == partition.get(&v) {
                    q += w - self.resolution * degree[u] * degree[v] / m2;
                }
            }
        }
        q / m2 * 2.0
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn build_two_clusters() -> RdfGraphAdapter {
        // Cluster 1: A-B-C fully connected
        // Cluster 2: D-E-F fully connected
        // Weak bridge: C → D
        let edges = [
            ("ex:A", "ex:B"),
            ("ex:B", "ex:A"),
            ("ex:B", "ex:C"),
            ("ex:C", "ex:B"),
            ("ex:A", "ex:C"),
            ("ex:C", "ex:A"),
            ("ex:D", "ex:E"),
            ("ex:E", "ex:D"),
            ("ex:E", "ex:F"),
            ("ex:F", "ex:E"),
            ("ex:D", "ex:F"),
            ("ex:F", "ex:D"),
            ("ex:C", "ex:D"), // bridge
        ];
        let triples: Vec<(String, String, String)> = edges
            .iter()
            .map(|(s, o)| (s.to_string(), "ex:rel".to_string(), o.to_string()))
            .collect();
        RdfGraphAdapter::from_triples(&triples)
    }

    #[test]
    fn test_louvain_two_clusters() {
        let g = build_two_clusters();
        let partition = LouvainCommunities::new().detect(&g);
        assert_eq!(partition.len(), g.node_count());

        // The three A/B/C nodes should be in the same community
        let a_id = g.get_node_id("ex:A").unwrap();
        let b_id = g.get_node_id("ex:B").unwrap();
        let c_id = g.get_node_id("ex:C").unwrap();
        assert_eq!(
            partition[&a_id], partition[&b_id],
            "A and B should share a community"
        );
        assert_eq!(
            partition[&b_id], partition[&c_id],
            "B and C should share a community"
        );

        // D/E/F should be in a different community
        let d_id = g.get_node_id("ex:D").unwrap();
        let e_id = g.get_node_id("ex:E").unwrap();
        assert_eq!(partition[&d_id], partition[&e_id]);
    }

    #[test]
    fn test_louvain_all_nodes_assigned() {
        let g = build_two_clusters();
        let partition = LouvainCommunities::new().detect(&g);
        for i in 0..g.node_count() {
            assert!(partition.contains_key(&i));
        }
    }

    #[test]
    fn test_louvain_empty_graph() {
        let g = RdfGraphAdapter::from_triples(&[]);
        let partition = LouvainCommunities::new().detect(&g);
        assert!(partition.is_empty());
    }

    #[test]
    fn test_louvain_single_node() {
        let triples = vec![(
            "ex:A".to_string(),
            "ex:rel".to_string(),
            "\"lit\"".to_string(),
        )];
        let g = RdfGraphAdapter::from_triples(&triples);
        let partition = LouvainCommunities::new().detect(&g);
        assert_eq!(partition.len(), 1);
    }

    #[test]
    fn test_communities_structure() {
        let g = build_two_clusters();
        let comms = LouvainCommunities::new().communities(&g);
        // Every community must be non-empty
        for c in &comms {
            assert!(!c.is_empty());
        }
        // All nodes covered
        let total: usize = comms.iter().map(|c| c.len()).sum();
        assert_eq!(total, g.node_count());
    }

    #[test]
    fn test_modularity_single_community() {
        let g = build_two_clusters();
        // All nodes in community 0 → likely low modularity
        let partition: HashMap<NodeId, usize> = (0..g.node_count()).map(|i| (i, 0)).collect();
        let q = LouvainCommunities::new().modularity(&g, &partition);
        // Modularity is well-defined (a finite number)
        assert!(q.is_finite());
    }

    #[test]
    fn test_modularity_two_perfect_clusters() {
        let g = build_two_clusters();
        // Manually assign the known-good partition
        let a_id = g.get_node_id("ex:A").unwrap();
        let b_id = g.get_node_id("ex:B").unwrap();
        let c_id = g.get_node_id("ex:C").unwrap();
        let d_id = g.get_node_id("ex:D").unwrap();
        let e_id = g.get_node_id("ex:E").unwrap();
        let f_id = g.get_node_id("ex:F").unwrap();
        let partition: HashMap<NodeId, usize> = [
            (a_id, 0),
            (b_id, 0),
            (c_id, 0),
            (d_id, 1),
            (e_id, 1),
            (f_id, 1),
        ]
        .into_iter()
        .collect();
        let q = LouvainCommunities::new().modularity(&g, &partition);
        assert!(
            q > 0.0,
            "modularity of good partition should be positive, got {q}"
        );
    }

    #[test]
    fn test_modularity_empty_graph() {
        let g = RdfGraphAdapter::from_triples(&[]);
        let partition: HashMap<NodeId, usize> = HashMap::new();
        let q = LouvainCommunities::new().modularity(&g, &partition);
        assert_eq!(q, 0.0);
    }

    #[test]
    fn test_louvain_resolution_parameter() {
        let g = build_two_clusters();
        // High resolution tends to produce more communities
        let p_high = LouvainCommunities {
            resolution: 2.0,
            max_iter: 20,
        }
        .detect(&g);
        // Low resolution tends to merge communities
        let p_low = LouvainCommunities {
            resolution: 0.1,
            max_iter: 20,
        }
        .detect(&g);
        let n_high = p_high
            .values()
            .copied()
            .collect::<std::collections::HashSet<_>>()
            .len();
        let n_low = p_low
            .values()
            .copied()
            .collect::<std::collections::HashSet<_>>()
            .len();
        // High resolution ≥ low resolution (not guaranteed but true for this graph)
        assert!(n_high >= n_low || n_high >= 1);
        let _ = n_low; // suppress unused warning
    }
}