Skip to main content

codemem_storage/graph/
algorithms.rs

1use super::GraphEngine;
2use codemem_core::{Edge, GraphNode, NodeKind, RelationshipType};
3use petgraph::graph::NodeIndex;
4use petgraph::Direction;
5use std::collections::{HashMap, HashSet, VecDeque};
6
7impl GraphEngine {
8    /// Compute PageRank scores for nodes in a single namespace using power iteration.
9    ///
10    /// Only nodes belonging to `namespace` participate. Edges that cross into other
11    /// namespaces are ignored so that unrelated projects cannot inflate or deflate
12    /// centrality scores within this namespace.
13    ///
14    /// Returns a map from node ID to PageRank score (only for nodes in the namespace).
15    pub fn pagerank_for_namespace(
16        &self,
17        namespace: &str,
18        damping: f64,
19        iterations: usize,
20        tolerance: f64,
21    ) -> HashMap<String, f64> {
22        // Collect petgraph indices for nodes that belong to this namespace.
23        let ns_indices: Vec<NodeIndex> = self
24            .graph
25            .node_indices()
26            .filter(|&idx| {
27                self.graph
28                    .node_weight(idx)
29                    .and_then(|id| self.nodes.get(id))
30                    .and_then(|n| n.namespace.as_deref())
31                    == Some(namespace)
32            })
33            .collect();
34
35        let n = ns_indices.len();
36        if n == 0 {
37            tracing::debug!(
38                namespace = %namespace,
39                "PageRank requested for namespace with no nodes"
40            );
41            return HashMap::new();
42        }
43
44        let ns_id_set: HashSet<NodeIndex> = ns_indices.iter().copied().collect();
45
46        let idx_pos: HashMap<NodeIndex, usize> = ns_indices
47            .iter()
48            .enumerate()
49            .map(|(i, &idx)| (idx, i))
50            .collect();
51
52        let nf = n as f64;
53        let initial = 1.0 / nf;
54        let mut scores = vec![initial; n];
55
56        // Out-degree counting only edges that stay within the namespace.
57        // Edges leaving the namespace are ignored — their score contribution
58        // is lost from the namespace (a form of implicit sink), keeping scores
59        // local to the namespace for isolated PageRank computation.
60        let out_degree: Vec<usize> = ns_indices
61            .iter()
62            .map(|&idx| {
63                self.graph
64                    .neighbors_directed(idx, Direction::Outgoing)
65                    .filter(|nb| ns_id_set.contains(nb))
66                    .count()
67            })
68            .collect();
69
70        for _ in 0..iterations {
71            let mut new_scores = vec![(1.0 - damping) / nf; n];
72
73            for (i, &idx) in ns_indices.iter().enumerate() {
74                let deg = out_degree[i];
75                if deg == 0 {
76                    // No in-namespace edges: distribute score evenly within namespace.
77                    // (Note: may have cross-namespace edges, but they don't contribute
78                    // to in-namespace PageRank; score remains in namespace.)
79                    let share = damping * scores[i] / nf;
80                    for ns in new_scores.iter_mut() {
81                        *ns += share;
82                    }
83                } else {
84                    let share = damping * scores[i] / deg as f64;
85                    for neighbor in self.graph.neighbors_directed(idx, Direction::Outgoing) {
86                        if let Some(&pos) = idx_pos.get(&neighbor) {
87                            new_scores[pos] += share;
88                        }
89                    }
90                }
91            }
92
93            let diff: f64 = scores
94                .iter()
95                .zip(new_scores.iter())
96                .map(|(a, b)| (a - b).abs())
97                .sum();
98
99            scores = new_scores;
100
101            if diff < tolerance {
102                break;
103            }
104        }
105
106        ns_indices
107            .iter()
108            .enumerate()
109            .filter_map(|(i, &idx)| {
110                self.graph
111                    .node_weight(idx)
112                    .map(|id| (id.clone(), scores[i]))
113            })
114            .collect()
115    }
116
117    /// Compute PageRank scores for all nodes using power iteration.
118    ///
119    /// - `damping`: probability of following an edge (default 0.85)
120    /// - `iterations`: max number of power iterations (default 100)
121    /// - `tolerance`: convergence threshold (default 1e-6)
122    ///
123    /// Returns a map from node ID to PageRank score.
124    pub fn pagerank(
125        &self,
126        damping: f64,
127        iterations: usize,
128        tolerance: f64,
129    ) -> HashMap<String, f64> {
130        let n = self.graph.node_count();
131        if n == 0 {
132            return HashMap::new();
133        }
134
135        let nf = n as f64;
136        let initial = 1.0 / nf;
137
138        // Collect all node indices in a stable order
139        let indices: Vec<NodeIndex> = self.graph.node_indices().collect();
140        let idx_pos: HashMap<NodeIndex, usize> = indices
141            .iter()
142            .enumerate()
143            .map(|(i, &idx)| (idx, i))
144            .collect();
145
146        let mut scores = vec![initial; n];
147
148        // Precompute out-degrees
149        let out_degree: Vec<usize> = indices
150            .iter()
151            .map(|&idx| {
152                self.graph
153                    .neighbors_directed(idx, Direction::Outgoing)
154                    .count()
155            })
156            .collect();
157
158        for _ in 0..iterations {
159            let mut new_scores = vec![(1.0 - damping) / nf; n];
160
161            // Distribute rank from each node to its out-neighbors
162            for (i, &idx) in indices.iter().enumerate() {
163                let deg = out_degree[i];
164                if deg == 0 {
165                    // Dangling node: distribute evenly to all nodes
166                    let share = damping * scores[i] / nf;
167                    for ns in new_scores.iter_mut() {
168                        *ns += share;
169                    }
170                } else {
171                    let share = damping * scores[i] / deg as f64;
172                    for neighbor in self.graph.neighbors_directed(idx, Direction::Outgoing) {
173                        if let Some(&pos) = idx_pos.get(&neighbor) {
174                            new_scores[pos] += share;
175                        }
176                    }
177                }
178            }
179
180            // Check convergence
181            let diff: f64 = scores
182                .iter()
183                .zip(new_scores.iter())
184                .map(|(a, b)| (a - b).abs())
185                .sum();
186
187            scores = new_scores;
188
189            if diff < tolerance {
190                break;
191            }
192        }
193
194        // Map back to node IDs
195        indices
196            .iter()
197            .enumerate()
198            .filter_map(|(i, &idx)| {
199                self.graph
200                    .node_weight(idx)
201                    .map(|id| (id.clone(), scores[i]))
202            })
203            .collect()
204    }
205
206    /// Compute Personalized PageRank with custom teleport weights.
207    ///
208    /// `seed_weights` maps node IDs to teleport probabilities (will be normalized).
209    /// Nodes not in seed_weights get zero teleport probability.
210    ///
211    /// Used for blast-radius analysis and HippoRAG-2-style retrieval.
212    #[cfg(test)]
213    pub fn personalized_pagerank(
214        &self,
215        seed_weights: &HashMap<String, f64>,
216        damping: f64,
217        iterations: usize,
218        tolerance: f64,
219    ) -> HashMap<String, f64> {
220        let n = self.graph.node_count();
221        if n == 0 {
222            return HashMap::new();
223        }
224
225        let nf = n as f64;
226
227        let indices: Vec<NodeIndex> = self.graph.node_indices().collect();
228        let idx_pos: HashMap<NodeIndex, usize> = indices
229            .iter()
230            .enumerate()
231            .map(|(i, &idx)| (idx, i))
232            .collect();
233
234        // Build and normalize the teleport vector
235        let mut teleport = vec![0.0f64; n];
236        let mut teleport_sum = 0.0;
237        for (i, &idx) in indices.iter().enumerate() {
238            if let Some(node_id) = self.graph.node_weight(idx) {
239                if let Some(&w) = seed_weights.get(node_id) {
240                    teleport[i] = w;
241                    teleport_sum += w;
242                }
243            }
244        }
245        // Normalize; if no seeds provided, fall back to uniform
246        if teleport_sum > 0.0 {
247            for t in teleport.iter_mut() {
248                *t /= teleport_sum;
249            }
250        } else {
251            for t in teleport.iter_mut() {
252                *t = 1.0 / nf;
253            }
254        }
255
256        let initial = 1.0 / nf;
257        let mut scores = vec![initial; n];
258
259        let out_degree: Vec<usize> = indices
260            .iter()
261            .map(|&idx| {
262                self.graph
263                    .neighbors_directed(idx, Direction::Outgoing)
264                    .count()
265            })
266            .collect();
267
268        for _ in 0..iterations {
269            let mut new_scores: Vec<f64> = teleport.iter().map(|&t| (1.0 - damping) * t).collect();
270
271            for (i, &idx) in indices.iter().enumerate() {
272                let deg = out_degree[i];
273                if deg == 0 {
274                    // Dangling node: distribute to teleport targets
275                    let share = damping * scores[i];
276                    for (j, t) in teleport.iter().enumerate() {
277                        new_scores[j] += share * t;
278                    }
279                } else {
280                    let share = damping * scores[i] / deg as f64;
281                    for neighbor in self.graph.neighbors_directed(idx, Direction::Outgoing) {
282                        if let Some(&pos) = idx_pos.get(&neighbor) {
283                            new_scores[pos] += share;
284                        }
285                    }
286                }
287            }
288
289            let diff: f64 = scores
290                .iter()
291                .zip(new_scores.iter())
292                .map(|(a, b)| (a - b).abs())
293                .sum();
294
295            scores = new_scores;
296
297            if diff < tolerance {
298                break;
299            }
300        }
301
302        indices
303            .iter()
304            .enumerate()
305            .filter_map(|(i, &idx)| {
306                self.graph
307                    .node_weight(idx)
308                    .map(|id| (id.clone(), scores[i]))
309            })
310            .collect()
311    }
312
313    /// Detect communities using the Louvain algorithm.
314    ///
315    /// Treats the directed graph as undirected for modularity computation.
316    /// `resolution` controls community granularity (1.0 = standard modularity).
317    /// Returns groups of node IDs, one group per community.
318    pub fn louvain_communities(&self, resolution: f64) -> Vec<Vec<String>> {
319        let n = self.graph.node_count();
320        if n == 0 {
321            return Vec::new();
322        }
323
324        let indices: Vec<NodeIndex> = self.graph.node_indices().collect();
325        let idx_pos: HashMap<NodeIndex, usize> = indices
326            .iter()
327            .enumerate()
328            .map(|(i, &idx)| (idx, i))
329            .collect();
330
331        // Build a lookup from petgraph EdgeIndex -> RelationshipType.
332        // Keyed by edge ID (not src/dst pair) to handle parallel edges correctly:
333        // two edges between the same node pair with different relationships
334        // (e.g., Calls + Inherits) must each get their own multiplier.
335        let edge_rel_by_idx: HashMap<petgraph::graph::EdgeIndex, RelationshipType> = {
336            // Reverse map: (src_node_id, dst_node_id, weight) -> RelationshipType
337            // We match petgraph edges to our Edge structs by endpoint node IDs.
338            // For parallel edges, we iterate self.edges and build a multimap.
339            let mut src_dst_rels: HashMap<(&str, &str), Vec<RelationshipType>> = HashMap::new();
340            for e in self.edges.values() {
341                src_dst_rels
342                    .entry((e.src.as_str(), e.dst.as_str()))
343                    .or_default()
344                    .push(e.relationship);
345            }
346            let mut lookup = HashMap::new();
347            for edge_ref in self.graph.edge_indices() {
348                if let Some((src_idx, dst_idx)) = self.graph.edge_endpoints(edge_ref) {
349                    if let (Some(src_id), Some(dst_id)) = (
350                        self.graph.node_weight(src_idx),
351                        self.graph.node_weight(dst_idx),
352                    ) {
353                        if let Some(rels) =
354                            src_dst_rels.get_mut(&(src_id.as_str(), dst_id.as_str()))
355                        {
356                            if let Some(rel) = rels.pop() {
357                                lookup.insert(edge_ref, rel);
358                            }
359                        }
360                    }
361                }
362            }
363            lookup
364        };
365
366        // Build undirected adjacency with weights.
367        // Deduplicate bidirectional edges: for A->B and B->A, merge into one
368        // undirected edge with combined weight.
369        // Heritage edges (Extends, Implements, Inherits) get a 0.5x multiplier
370        // to reduce coupling across inheritance boundaries in community detection.
371        let mut undirected_weights: HashMap<(usize, usize), f64> = HashMap::new();
372        for edge_ref in self.graph.edge_indices() {
373            if let Some((src_idx, dst_idx)) = self.graph.edge_endpoints(edge_ref) {
374                let w = self.graph[edge_ref];
375                if let (Some(&si), Some(&di)) = (idx_pos.get(&src_idx), idx_pos.get(&dst_idx)) {
376                    let multiplier = edge_rel_by_idx
377                        .get(&edge_ref)
378                        .map(|rel| match rel {
379                            RelationshipType::Extends
380                            | RelationshipType::Implements
381                            | RelationshipType::Inherits => 0.5,
382                            _ => 1.0,
383                        })
384                        .unwrap_or(1.0);
385
386                    let key = if si <= di { (si, di) } else { (di, si) };
387                    *undirected_weights.entry(key).or_insert(0.0) += w * multiplier;
388                }
389            }
390        }
391
392        let mut adj: Vec<Vec<(usize, f64)>> = vec![Vec::new(); n];
393        let mut total_weight = 0.0;
394
395        for (&(si, di), &w) in &undirected_weights {
396            adj[si].push((di, w));
397            if si != di {
398                adj[di].push((si, w));
399            }
400            total_weight += w;
401        }
402
403        if total_weight == 0.0 {
404            // No edges: each node is its own community
405            return indices
406                .iter()
407                .filter_map(|&idx| self.graph.node_weight(idx).map(|id| vec![id.clone()]))
408                .collect();
409        }
410
411        // m = total undirected edge weight
412        let m = total_weight;
413        let m2 = 2.0 * m;
414
415        // Weighted degree of each node (sum of incident undirected edge weights)
416        let k: Vec<f64> = (0..n)
417            .map(|i| adj[i].iter().map(|&(_, w)| w).sum())
418            .collect();
419
420        // Initial assignment: each node in its own community
421        let mut community: Vec<usize> = (0..n).collect();
422
423        // sigma_tot[c] = sum of degrees of nodes in community c.
424        // Maintained incrementally to avoid O(n^2) per pass.
425        let mut sigma_tot: Vec<f64> = k.clone();
426
427        // Iteratively move nodes to improve modularity
428        let mut improved = true;
429        let max_passes = 100;
430        let mut pass = 0;
431
432        while improved && pass < max_passes {
433            improved = false;
434            pass += 1;
435
436            for i in 0..n {
437                let current_comm = community[i];
438                let ki = k[i];
439
440                // Compute weights to each neighboring community
441                let mut comm_weights: HashMap<usize, f64> = HashMap::new();
442                for &(j, w) in &adj[i] {
443                    *comm_weights.entry(community[j]).or_insert(0.0) += w;
444                }
445
446                // Standard Louvain delta-Q formula:
447                // delta_Q = [w_in_new/m - resolution * ki * sigma_new / m2]
448                //         - [w_in_current/m - resolution * ki * (sigma_current - ki) / m2]
449                let w_in_current = comm_weights.get(&current_comm).copied().unwrap_or(0.0);
450                let sigma_current = sigma_tot[current_comm];
451                let remove_cost =
452                    w_in_current / m - resolution * ki * (sigma_current - ki) / (m2 * m);
453
454                // Find best community to move to
455                let mut best_comm = current_comm;
456                let mut best_gain = 0.0;
457
458                for (&comm, &w_in_comm) in &comm_weights {
459                    if comm == current_comm {
460                        continue;
461                    }
462                    let sigma_comm = sigma_tot[comm];
463                    let gain =
464                        w_in_comm / m - resolution * ki * sigma_comm / (m2 * m) - remove_cost;
465                    if gain > best_gain {
466                        best_gain = gain;
467                        best_comm = comm;
468                    }
469                }
470
471                if best_comm != current_comm {
472                    // Update sigma_tot incrementally
473                    sigma_tot[current_comm] -= ki;
474                    sigma_tot[best_comm] += ki;
475                    community[i] = best_comm;
476                    improved = true;
477                }
478            }
479        }
480
481        // Group nodes by community
482        let mut groups: HashMap<usize, Vec<String>> = HashMap::new();
483        for (i, &idx) in indices.iter().enumerate() {
484            if let Some(node_id) = self.graph.node_weight(idx) {
485                groups
486                    .entry(community[i])
487                    .or_default()
488                    .push(node_id.clone());
489            }
490        }
491
492        let mut result: Vec<Vec<String>> = groups.into_values().collect();
493        for group in result.iter_mut() {
494            group.sort();
495        }
496        result.sort();
497        result
498    }
499
500    /// Compute betweenness centrality for all nodes using Brandes' algorithm.
501    ///
502    /// For graphs with more than 1000 nodes, samples sqrt(n) source nodes
503    /// for approximate computation.
504    ///
505    /// Returns a map from node ID to betweenness centrality score (normalized by
506    /// 1/((n-1)(n-2)) for directed graphs).
507    pub fn betweenness_centrality(&self) -> HashMap<String, f64> {
508        let n = self.graph.node_count();
509        if n <= 2 {
510            return self
511                .graph
512                .node_indices()
513                .filter_map(|idx| self.graph.node_weight(idx).map(|id| (id.clone(), 0.0)))
514                .collect();
515        }
516
517        let indices: Vec<NodeIndex> = self.graph.node_indices().collect();
518        let idx_pos: HashMap<NodeIndex, usize> = indices
519            .iter()
520            .enumerate()
521            .map(|(i, &idx)| (idx, i))
522            .collect();
523
524        let mut centrality = vec![0.0f64; n];
525
526        // Determine source nodes (sample for large graphs)
527        let sources: Vec<usize> = if n > 1000 {
528            let sample_size = (n as f64).sqrt() as usize;
529            // Deterministic sampling: evenly spaced
530            let step = n / sample_size;
531            (0..sample_size).map(|i| i * step).collect()
532        } else {
533            (0..n).collect()
534        };
535
536        let scale = if n > 1000 {
537            n as f64 / sources.len() as f64
538        } else {
539            1.0
540        };
541
542        for &s in &sources {
543            // Brandes' algorithm from source s
544            let mut stack: Vec<usize> = Vec::new();
545            let mut predecessors: Vec<Vec<usize>> = vec![Vec::new(); n];
546            let mut sigma = vec![0.0f64; n]; // number of shortest paths
547            sigma[s] = 1.0;
548            let mut dist: Vec<i64> = vec![-1; n];
549            dist[s] = 0;
550
551            let mut queue: VecDeque<usize> = VecDeque::new();
552            queue.push_back(s);
553
554            while let Some(v) = queue.pop_front() {
555                stack.push(v);
556                let v_idx = indices[v];
557                for neighbor in self.graph.neighbors_directed(v_idx, Direction::Outgoing) {
558                    if let Some(&w) = idx_pos.get(&neighbor) {
559                        if dist[w] < 0 {
560                            dist[w] = dist[v] + 1;
561                            queue.push_back(w);
562                        }
563                        if dist[w] == dist[v] + 1 {
564                            sigma[w] += sigma[v];
565                            predecessors[w].push(v);
566                        }
567                    }
568                }
569            }
570
571            let mut delta = vec![0.0f64; n];
572            while let Some(w) = stack.pop() {
573                for &v in &predecessors[w] {
574                    delta[v] += (sigma[v] / sigma[w]) * (1.0 + delta[w]);
575                }
576                if w != s {
577                    centrality[w] += delta[w];
578                }
579            }
580        }
581
582        // Apply sampling scale and normalize
583        let norm = ((n - 1) * (n - 2)) as f64;
584        indices
585            .iter()
586            .enumerate()
587            .filter_map(|(i, &idx)| {
588                self.graph
589                    .node_weight(idx)
590                    .map(|id| (id.clone(), centrality[i] * scale / norm))
591            })
592            .collect()
593    }
594
595    /// Find all strongly connected components using Tarjan's algorithm.
596    ///
597    /// Returns groups of node IDs. Each group is a strongly connected component
598    /// where every node can reach every other node via directed edges.
599    pub fn strongly_connected_components(&self) -> Vec<Vec<String>> {
600        let sccs = petgraph::algo::tarjan_scc(&self.graph);
601
602        let mut result: Vec<Vec<String>> = sccs
603            .into_iter()
604            .map(|component| {
605                let mut ids: Vec<String> = component
606                    .into_iter()
607                    .filter_map(|idx| self.graph.node_weight(idx).cloned())
608                    .collect();
609                ids.sort();
610                ids
611            })
612            .collect();
613
614        result.sort();
615        result
616    }
617
618    /// Compute topological layers using Kahn's algorithm.
619    ///
620    /// Returns layers where all nodes in layer i have no dependencies on nodes
621    /// in layer i or later. For cyclic graphs, SCCs are condensed into single
622    /// super-nodes first, then the resulting DAG is topologically sorted.
623    ///
624    /// Each inner Vec contains the node IDs at that layer.
625    pub fn topological_layers(&self) -> Vec<Vec<String>> {
626        let n = self.graph.node_count();
627        if n == 0 {
628            return Vec::new();
629        }
630
631        let indices: Vec<NodeIndex> = self.graph.node_indices().collect();
632        let idx_pos: HashMap<NodeIndex, usize> = indices
633            .iter()
634            .enumerate()
635            .map(|(i, &idx)| (idx, i))
636            .collect();
637
638        // Step 1: Find SCCs
639        let sccs = petgraph::algo::tarjan_scc(&self.graph);
640
641        // Map each node position to its SCC index
642        let mut node_to_scc = vec![0usize; n];
643        for (scc_idx, scc) in sccs.iter().enumerate() {
644            for &node_idx in scc {
645                if let Some(&pos) = idx_pos.get(&node_idx) {
646                    node_to_scc[pos] = scc_idx;
647                }
648            }
649        }
650
651        let num_sccs = sccs.len();
652
653        // Step 2: Build condensed DAG (SCC graph)
654        let mut condensed_adj: Vec<HashSet<usize>> = vec![HashSet::new(); num_sccs];
655        let mut condensed_in_degree = vec![0usize; num_sccs];
656
657        for &idx in &indices {
658            if let Some(&src_pos) = idx_pos.get(&idx) {
659                let src_scc = node_to_scc[src_pos];
660                for neighbor in self.graph.neighbors_directed(idx, Direction::Outgoing) {
661                    if let Some(&dst_pos) = idx_pos.get(&neighbor) {
662                        let dst_scc = node_to_scc[dst_pos];
663                        if src_scc != dst_scc && condensed_adj[src_scc].insert(dst_scc) {
664                            condensed_in_degree[dst_scc] += 1;
665                        }
666                    }
667                }
668            }
669        }
670
671        // Step 3: Kahn's algorithm on the condensed DAG
672        let mut queue: VecDeque<usize> = VecDeque::new();
673        for (i, &deg) in condensed_in_degree.iter().enumerate().take(num_sccs) {
674            if deg == 0 {
675                queue.push_back(i);
676            }
677        }
678
679        let mut scc_layers: Vec<Vec<usize>> = Vec::new();
680        while !queue.is_empty() {
681            let mut layer = Vec::new();
682            let mut next_queue = VecDeque::new();
683
684            while let Some(scc_idx) = queue.pop_front() {
685                layer.push(scc_idx);
686                for &neighbor_scc in &condensed_adj[scc_idx] {
687                    condensed_in_degree[neighbor_scc] -= 1;
688                    if condensed_in_degree[neighbor_scc] == 0 {
689                        next_queue.push_back(neighbor_scc);
690                    }
691                }
692            }
693
694            scc_layers.push(layer);
695            queue = next_queue;
696        }
697
698        // Step 4: Expand SCC layers back to node IDs
699        let mut result: Vec<Vec<String>> = Vec::new();
700        for scc_layer in scc_layers {
701            let mut layer_nodes: Vec<String> = Vec::new();
702            for scc_idx in scc_layer {
703                for &node_idx in &sccs[scc_idx] {
704                    if let Some(id) = self.graph.node_weight(node_idx) {
705                        layer_nodes.push(id.clone());
706                    }
707                }
708            }
709            layer_nodes.sort();
710            result.push(layer_nodes);
711        }
712
713        result
714    }
715
716    /// Return top-N nodes by centrality and edges between them.
717    /// Optionally filter by namespace and/or node kinds.
718    ///
719    /// Non-structural edges (CALLS, IMPORTS, INHERITS, IMPLEMENTS, DEPENDS_ON)
720    /// from top-N nodes pull their targets into the result so that these
721    /// relationship types are visible in the UI graph.
722    pub fn subgraph_top_n(
723        &self,
724        n: usize,
725        namespace: Option<&str>,
726        kinds: Option<&[NodeKind]>,
727    ) -> (Vec<GraphNode>, Vec<Edge>) {
728        let ns_filter = |node: &&GraphNode| -> bool {
729            if let Some(ns) = namespace {
730                node.namespace.as_deref() == Some(ns)
731            } else {
732                true
733            }
734        };
735        let kind_filter = |node: &&GraphNode| -> bool {
736            if let Some(k) = kinds {
737                k.contains(&node.kind)
738            } else {
739                true
740            }
741        };
742
743        let mut candidates: Vec<&GraphNode> = self
744            .nodes
745            .values()
746            .filter(ns_filter)
747            .filter(kind_filter)
748            .collect();
749
750        // Sort by centrality descending
751        candidates.sort_by(|a, b| {
752            b.centrality
753                .partial_cmp(&a.centrality)
754                .unwrap_or(std::cmp::Ordering::Equal)
755        });
756
757        // Take top N
758        candidates.truncate(n);
759
760        let mut top_ids: HashSet<&str> = candidates.iter().map(|node| node.id.as_str()).collect();
761
762        // Expand: for non-structural edges from top-N nodes, pull in the
763        // other endpoint so CALLS/IMPORTS/etc. edges appear in the result.
764        // Budget: allow up to 20% extra nodes to keep the response bounded.
765        let budget = (n / 5).max(1);
766        let mut extra_ids: Vec<&str> = Vec::new();
767        for edge in self.edges.values() {
768            if extra_ids.len() >= budget {
769                break;
770            }
771            if edge.relationship == RelationshipType::Contains
772                || edge.relationship == RelationshipType::PartOf
773            {
774                continue;
775            }
776            let (in_top, other) = if top_ids.contains(edge.src.as_str()) {
777                (true, edge.dst.as_str())
778            } else if top_ids.contains(edge.dst.as_str()) {
779                (true, edge.src.as_str())
780            } else {
781                (false, "")
782            };
783            if in_top && !top_ids.contains(other) {
784                if let Some(node) = self.nodes.get(other) {
785                    if ns_filter(&node) && kind_filter(&node) {
786                        extra_ids.push(other);
787                        top_ids.insert(other);
788                    }
789                }
790            }
791        }
792
793        let mut nodes_vec: Vec<GraphNode> = candidates.into_iter().cloned().collect();
794        for id in &extra_ids {
795            if let Some(node) = self.nodes.get(*id) {
796                nodes_vec.push(node.clone());
797            }
798        }
799
800        // Collect edges where both src and dst are in the expanded set
801        let edges_vec: Vec<Edge> = self
802            .edges
803            .values()
804            .filter(|edge| {
805                top_ids.contains(edge.src.as_str()) && top_ids.contains(edge.dst.as_str())
806            })
807            .cloned()
808            .collect();
809
810        (nodes_vec, edges_vec)
811    }
812
813    /// Label a community by the most common parent directories of its member nodes.
814    ///
815    /// Takes a list of node IDs, looks up their `file_path` payloads, extracts
816    /// parent directory names (second-to-last path component), and returns a label.
817    /// If all members share the same directory, returns that name.
818    /// If mixed, combines the two most frequent directories with `+`.
819    /// Returns `"unknown"` if no file paths are found.
820    pub fn label_community(&self, member_ids: &[&str]) -> String {
821        let mut dir_counts: HashMap<&str, usize> = HashMap::new();
822
823        for &node_id in member_ids {
824            if let Some(node) = self.nodes.get(node_id) {
825                if let Some(file_path) = node.payload.get("file_path").and_then(|v| v.as_str()) {
826                    // Extract parent directory name (second-to-last path component)
827                    if let Some(dir) = file_path.rsplit('/').nth(1) {
828                        *dir_counts.entry(dir).or_insert(0) += 1;
829                    }
830                }
831            }
832        }
833
834        if dir_counts.is_empty() {
835            return "unknown".to_string();
836        }
837
838        let mut sorted: Vec<_> = dir_counts.into_iter().collect();
839        sorted.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(b.0)));
840
841        if sorted.len() == 1 {
842            sorted[0].0.to_string()
843        } else if sorted.len() <= 2 {
844            format!("{}+{}", sorted[0].0, sorted[1].0)
845        } else {
846            let extra = sorted.len() - 2;
847            format!("{}+{} +{extra} more", sorted[0].0, sorted[1].0)
848        }
849    }
850
851    /// Return node-to-community-ID mapping for Louvain.
852    pub fn louvain_with_assignment(&self, resolution: f64) -> HashMap<String, usize> {
853        let communities = self.louvain_communities(resolution);
854        let mut assignment = HashMap::new();
855        for (idx, community) in communities.into_iter().enumerate() {
856            for node_id in community {
857                assignment.insert(node_id, idx);
858            }
859        }
860        assignment
861    }
862}
863
864#[cfg(test)]
865#[path = "../tests/graph_algorithms_tests.rs"]
866mod tests;