Skip to main content

graphify_analyze/
lib.rs

1//! Graph analysis algorithms for graphify.
2//!
3//! Identifies god nodes, surprising cross-community connections, and generates
4//! suggested questions for exploration.
5
6use std::collections::{HashMap, HashSet};
7
8use tracing::debug;
9
10use graphify_core::graph::KnowledgeGraph;
11use graphify_core::model::{BridgeNode, DependencyCycle, GodNode, PageRankNode, Surprise};
12
13/// Find the most-connected nodes, excluding file-level hubs and method stubs.
14///
15/// Returns up to `top_n` nodes sorted by degree descending.
16/// Generic labels like "lib", "mod", "main" are disambiguated with the crate/module
17/// name extracted from `source_file`.
18pub fn god_nodes(graph: &KnowledgeGraph, top_n: usize) -> Vec<GodNode> {
19    let generic_labels = ["lib", "mod", "main", "index", "init"];
20
21    let mut candidates: Vec<GodNode> = graph
22        .node_ids()
23        .into_iter()
24        .filter(|id| !is_file_node(graph, id) && !is_method_stub(graph, id))
25        .map(|id| {
26            let node = graph.get_node(&id).unwrap();
27            let label = if generic_labels.contains(&node.label.as_str()) {
28                disambiguate_label(&node.label, &node.source_file)
29            } else {
30                node.label.clone()
31            };
32            GodNode {
33                id: id.clone(),
34                label,
35                degree: graph.degree(&id),
36                community: node.community,
37            }
38        })
39        .collect();
40
41    candidates.sort_by_key(|b| std::cmp::Reverse(b.degree));
42    candidates.truncate(top_n);
43    debug!("found {} god node candidates", candidates.len());
44    candidates
45}
46
47/// Disambiguate a generic label using the source file path.
48///
49/// Extracts the parent directory or crate name to create a unique label.
50/// Examples:
51/// - ("lib", "crates/graphify-export/src/lib.rs") → "graphify-export::lib"
52/// - ("mod", "src/config.rs") → "src::mod"
53/// - ("lib", "src/lib.rs") → "lib"
54fn disambiguate_label(label: &str, source_file: &str) -> String {
55    let parts: Vec<&str> = source_file.split('/').collect();
56    for (i, &segment) in parts.iter().enumerate() {
57        if segment == "src" && i > 0 {
58            return format!("{}::{}", parts[i - 1], label);
59        }
60    }
61    if parts.len() >= 2 {
62        return format!("{}::{}", parts[parts.len() - 2], label);
63    }
64    label.to_string()
65}
66
67/// Find surprising connections that span different communities or source files.
68///
69/// A connection is "surprising" if:
70/// - the two endpoints belong to different communities, or
71/// - the two endpoints come from different source files, or
72/// - the edge confidence is `AMBIGUOUS` or `INFERRED`.
73///
74/// Results are scored and the top `top_n` are returned.
75pub fn surprising_connections(
76    graph: &KnowledgeGraph,
77    communities: &HashMap<usize, Vec<String>>,
78    top_n: usize,
79) -> Vec<Surprise> {
80    let node_to_community: HashMap<&str, usize> = communities
81        .iter()
82        .flat_map(|(&cid, nodes)| nodes.iter().map(move |n| (n.as_str(), cid)))
83        .collect();
84
85    let mut surprises: Vec<(f64, Surprise)> = Vec::new();
86
87    for (src, tgt, edge) in graph.edges_with_endpoints() {
88        if is_file_node(graph, src) || is_file_node(graph, tgt) {
89            continue;
90        }
91        if is_method_stub(graph, src) || is_method_stub(graph, tgt) {
92            continue;
93        }
94
95        let src_comm = node_to_community.get(src).copied().unwrap_or(usize::MAX);
96        let tgt_comm = node_to_community.get(tgt).copied().unwrap_or(usize::MAX);
97
98        let mut score = 0.0;
99
100        if src_comm != tgt_comm {
101            score += 2.0;
102        }
103
104        let src_node = graph.get_node(src);
105        let tgt_node = graph.get_node(tgt);
106        if let (Some(sn), Some(tn)) = (src_node, tgt_node)
107            && !sn.source_file.is_empty()
108            && !tn.source_file.is_empty()
109            && sn.source_file != tn.source_file
110        {
111            score += 1.0;
112        }
113
114        use graphify_core::confidence::Confidence;
115        match edge.confidence {
116            Confidence::Ambiguous => score += 3.0,
117            Confidence::Inferred => score += 1.5,
118            Confidence::Extracted => {}
119        }
120
121        if score > 0.0 {
122            surprises.push((
123                score,
124                Surprise {
125                    source: src.to_string(),
126                    target: tgt.to_string(),
127                    source_community: src_comm,
128                    target_community: tgt_comm,
129                    relation: edge.relation.clone(),
130                },
131            ));
132        }
133    }
134
135    surprises.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
136    surprises.truncate(top_n);
137    debug!("found {} surprising connections", surprises.len());
138    surprises.into_iter().map(|(_, s)| s).collect()
139}
140
141/// Generate graph-aware questions based on structural patterns.
142///
143/// Categories:
144/// 1. AMBIGUOUS edges → unresolved relationship questions
145/// 2. Bridge nodes (high cross-community degree) → cross-cutting concern questions
146/// 3. God nodes with INFERRED edges → verification questions
147/// 4. Isolated nodes → exploration questions
148/// 5. Low-cohesion communities → structural questions
149pub fn suggest_questions(
150    graph: &KnowledgeGraph,
151    communities: &HashMap<usize, Vec<String>>,
152    community_labels: &HashMap<usize, String>,
153    top_n: usize,
154) -> Vec<HashMap<String, String>> {
155    let mut questions: Vec<HashMap<String, String>> = Vec::new();
156
157    {
158        use graphify_core::confidence::Confidence;
159        for (src, tgt, edge) in graph.edges_with_endpoints() {
160            if edge.confidence == Confidence::Ambiguous {
161                let mut q = HashMap::new();
162                q.insert("category".into(), "ambiguous_relationship".into());
163                q.insert(
164                    "question".into(),
165                    format!(
166                        "What is the exact relationship between '{}' and '{}'? (marked as {})",
167                        src, tgt, edge.relation
168                    ),
169                );
170                q.insert("source".into(), src.to_string());
171                q.insert("target".into(), tgt.to_string());
172                questions.push(q);
173            }
174        }
175    }
176
177    // 2. Bridge nodes (nodes with neighbours in multiple communities)
178    {
179        let node_to_comm: HashMap<&str, usize> = communities
180            .iter()
181            .flat_map(|(&cid, nodes)| nodes.iter().map(move |n| (n.as_str(), cid)))
182            .collect();
183
184        for id in graph.node_ids() {
185            if is_file_node(graph, &id) {
186                continue;
187            }
188            let nbrs = graph.get_neighbors(&id);
189            let nbr_comms: HashSet<usize> = nbrs
190                .iter()
191                .filter_map(|n| node_to_comm.get(n.id.as_str()).copied())
192                .collect();
193            if nbr_comms.len() >= 3 {
194                let comm_names: Vec<String> = nbr_comms
195                    .iter()
196                    .filter_map(|c| community_labels.get(c).cloned())
197                    .collect();
198                let mut q = HashMap::new();
199                q.insert("category".into(), "bridge_node".into());
200                q.insert(
201                    "question".into(),
202                    format!(
203                        "How does '{}' relate to {} different communities{}?",
204                        id,
205                        nbr_comms.len(),
206                        if comm_names.is_empty() {
207                            String::new()
208                        } else {
209                            format!(" ({})", comm_names.join(", "))
210                        }
211                    ),
212                );
213                q.insert("node".into(), id.clone());
214                questions.push(q);
215            }
216        }
217    }
218
219    {
220        use graphify_core::confidence::Confidence;
221        let gods = god_nodes(graph, 5);
222        for g in &gods {
223            let has_inferred = graph.edges_with_endpoints().iter().any(|(s, t, e)| {
224                (*s == g.id || *t == g.id) && e.confidence == Confidence::Inferred
225            });
226            if has_inferred {
227                let mut q = HashMap::new();
228                q.insert("category".into(), "verification".into());
229                q.insert(
230                    "question".into(),
231                    format!(
232                        "Can you verify the inferred relationships of '{}' (degree {})?",
233                        g.label, g.degree
234                    ),
235                );
236                q.insert("node".into(), g.id.clone());
237                questions.push(q);
238            }
239        }
240    }
241
242    {
243        for id in graph.node_ids() {
244            if graph.degree(&id) == 0
245                && !is_file_node(graph, &id)
246                && let Some(node) = graph.get_node(&id)
247            {
248                let mut q = HashMap::new();
249                q.insert("category".into(), "isolated_node".into());
250                q.insert(
251                    "question".into(),
252                    format!(
253                        "What role does '{}' play? It has no connections in the graph.",
254                        node.label
255                    ),
256                );
257                q.insert("node".into(), id.clone());
258                questions.push(q);
259            }
260        }
261    }
262
263    {
264        for (&cid, nodes) in communities {
265            let n = nodes.len();
266            if n <= 1 {
267                continue;
268            }
269            let cohesion = compute_cohesion(graph, nodes);
270            if cohesion < 0.3 {
271                let label = community_labels
272                    .get(&cid)
273                    .cloned()
274                    .unwrap_or_else(|| format!("community-{cid}"));
275                let mut q = HashMap::new();
276                q.insert("category".into(), "low_cohesion".into());
277                q.insert(
278                    "question".into(),
279                    format!(
280                        "Why is '{label}' ({n} nodes) loosely connected (cohesion {cohesion:.2})? Should it be split?"
281                    ),
282                );
283                q.insert("community".into(), cid.to_string());
284                questions.push(q);
285            }
286        }
287    }
288
289    questions.truncate(top_n);
290    debug!("generated {} questions", questions.len());
291    questions
292}
293
294/// Compare two graph snapshots and return a summary of changes.
295pub fn graph_diff(
296    old: &KnowledgeGraph,
297    new: &KnowledgeGraph,
298) -> HashMap<String, serde_json::Value> {
299    let old_node_ids: HashSet<String> = old.node_ids().into_iter().collect();
300    let new_node_ids: HashSet<String> = new.node_ids().into_iter().collect();
301
302    let added_nodes: Vec<&String> = new_node_ids.difference(&old_node_ids).collect();
303    let removed_nodes: Vec<&String> = old_node_ids.difference(&new_node_ids).collect();
304
305    let old_edge_keys: HashSet<(String, String, String)> = old
306        .edges_with_endpoints()
307        .iter()
308        .map(|(s, t, e)| (s.to_string(), t.to_string(), e.relation.clone()))
309        .collect();
310    let new_edge_keys: HashSet<(String, String, String)> = new
311        .edges_with_endpoints()
312        .iter()
313        .map(|(s, t, e)| (s.to_string(), t.to_string(), e.relation.clone()))
314        .collect();
315
316    let added_edges: Vec<&(String, String, String)> =
317        new_edge_keys.difference(&old_edge_keys).collect();
318    let removed_edges: Vec<&(String, String, String)> =
319        old_edge_keys.difference(&new_edge_keys).collect();
320
321    let mut result = HashMap::new();
322    result.insert("added_nodes".into(), serde_json::json!(added_nodes));
323    result.insert("removed_nodes".into(), serde_json::json!(removed_nodes));
324    result.insert(
325        "added_edges".into(),
326        serde_json::json!(
327            added_edges
328                .iter()
329                .map(|(s, t, r)| { serde_json::json!({"source": s, "target": t, "relation": r}) })
330                .collect::<Vec<_>>()
331        ),
332    );
333    result.insert(
334        "removed_edges".into(),
335        serde_json::json!(
336            removed_edges
337                .iter()
338                .map(|(s, t, r)| { serde_json::json!({"source": s, "target": t, "relation": r}) })
339                .collect::<Vec<_>>()
340        ),
341    );
342    result.insert(
343        "summary".into(),
344        serde_json::json!({
345            "nodes_added": added_nodes.len(),
346            "nodes_removed": removed_nodes.len(),
347            "edges_added": added_edges.len(),
348            "edges_removed": removed_edges.len(),
349            "old_node_count": old.node_count(),
350            "new_node_count": new.node_count(),
351            "old_edge_count": old.edge_count(),
352            "new_edge_count": new.edge_count(),
353        }),
354    );
355
356    result
357}
358
359/// Find nodes that bridge multiple communities.
360///
361/// A bridge node has a high ratio of cross-community edges to total edges.
362/// Returns up to `top_n` nodes sorted by bridge ratio descending.
363pub fn community_bridges(
364    graph: &KnowledgeGraph,
365    communities: &HashMap<usize, Vec<String>>,
366    top_n: usize,
367) -> Vec<BridgeNode> {
368    let node_to_community: HashMap<&str, usize> = communities
369        .iter()
370        .flat_map(|(&cid, nodes)| nodes.iter().map(move |n| (n.as_str(), cid)))
371        .collect();
372
373    let mut bridges: Vec<BridgeNode> = graph
374        .node_ids()
375        .into_iter()
376        .filter(|id| !is_file_node(graph, id))
377        .filter_map(|id| {
378            let node = graph.get_node(&id)?;
379            let my_comm = node_to_community.get(id.as_str()).copied()?;
380            let neighbors = graph.neighbor_ids(&id);
381            let total = neighbors.len();
382            if total == 0 {
383                return None;
384            }
385
386            let mut touched: HashSet<usize> = HashSet::new();
387            touched.insert(my_comm);
388            let mut cross = 0usize;
389            for nid in &neighbors {
390                let neighbor_comm = node_to_community
391                    .get(nid.as_str())
392                    .copied()
393                    .unwrap_or(my_comm);
394                if neighbor_comm != my_comm {
395                    cross += 1;
396                    touched.insert(neighbor_comm);
397                }
398            }
399
400            if cross == 0 {
401                return None;
402            }
403
404            let ratio = cross as f64 / total as f64;
405            let mut communities_touched: Vec<usize> = touched.into_iter().collect();
406            communities_touched.sort_unstable();
407
408            Some(BridgeNode {
409                id: id.clone(),
410                label: node.label.clone(),
411                total_edges: total,
412                cross_community_edges: cross,
413                bridge_ratio: ratio,
414                communities_touched,
415            })
416        })
417        .collect();
418
419    bridges.sort_by(|a, b| {
420        b.bridge_ratio
421            .partial_cmp(&a.bridge_ratio)
422            .unwrap_or(std::cmp::Ordering::Equal)
423    });
424    bridges.truncate(top_n);
425    bridges
426}
427
428/// Is this a file-level hub node?
429fn is_file_node(graph: &KnowledgeGraph, node_id: &str) -> bool {
430    if let Some(node) = graph.get_node(node_id)
431        && !node.source_file.is_empty()
432        && let Some(fname) = std::path::Path::new(&node.source_file).file_name()
433        && node.label == fname.to_string_lossy()
434    {
435        return true;
436    }
437    false
438}
439
440/// Is this a method stub (.method_name() or isolated fn()?
441fn is_method_stub(graph: &KnowledgeGraph, node_id: &str) -> bool {
442    if let Some(node) = graph.get_node(node_id) {
443        if node.label.starts_with('.') && node.label.ends_with("()") {
444            return true;
445        }
446        if node.label.ends_with("()") && graph.degree(node_id) <= 1 {
447            return true;
448        }
449    }
450    false
451}
452
453/// Compute cohesion for a set of nodes (inline helper).
454fn compute_cohesion(graph: &KnowledgeGraph, community_nodes: &[String]) -> f64 {
455    let n = community_nodes.len();
456    if n <= 1 {
457        return 1.0;
458    }
459    let node_set: HashSet<&str> = community_nodes
460        .iter()
461        .map(std::string::String::as_str)
462        .collect();
463    let mut actual_edges = 0usize;
464    for node_id in community_nodes {
465        for neighbor in graph.get_neighbors(node_id) {
466            if node_set.contains(neighbor.id.as_str()) {
467                actual_edges += 1;
468            }
469        }
470    }
471    actual_edges /= 2;
472    let possible = n * (n - 1) / 2;
473    if possible == 0 {
474        return 0.0;
475    }
476    actual_edges as f64 / possible as f64
477}
478
479/// Compute PageRank importance scores for all nodes.
480///
481/// Returns the top `top_n` nodes sorted by PageRank score descending.
482/// Uses the power iteration method with configurable damping factor (default 0.85).
483pub fn pagerank(
484    graph: &KnowledgeGraph,
485    top_n: usize,
486    damping: f64,
487    max_iterations: usize,
488) -> Vec<PageRankNode> {
489    let n = graph.node_count();
490    if n == 0 {
491        return Vec::new();
492    }
493
494    let ids = graph.node_ids();
495    let id_to_idx: HashMap<&str, usize> = ids
496        .iter()
497        .enumerate()
498        .map(|(i, s)| (s.as_str(), i))
499        .collect();
500
501    let mut adj: Vec<Vec<usize>> = vec![Vec::new(); n];
502    for (src, tgt, _) in graph.edges_with_endpoints() {
503        if let (Some(&si), Some(&ti)) = (id_to_idx.get(src), id_to_idx.get(tgt)) {
504            adj[si].push(ti);
505            adj[ti].push(si);
506        }
507    }
508
509    let out_degree: Vec<usize> = adj.iter().map(std::vec::Vec::len).collect();
510    let init = 1.0 / n as f64;
511    let mut rank = vec![init; n];
512    let mut next_rank = vec![0.0f64; n];
513
514    for _ in 0..max_iterations {
515        let teleport = (1.0 - damping) / n as f64;
516
517        let dangling_sum: f64 = rank
518            .iter()
519            .enumerate()
520            .filter(|(i, _)| out_degree[*i] == 0)
521            .map(|(_, r)| r)
522            .sum();
523
524        for v in 0..n {
525            let mut sum = 0.0;
526            for &u in &adj[v] {
527                if out_degree[u] > 0 {
528                    sum += rank[u] / out_degree[u] as f64;
529                }
530            }
531            next_rank[v] = teleport + damping * (sum + dangling_sum / n as f64);
532        }
533
534        let delta: f64 = rank
535            .iter()
536            .zip(next_rank.iter())
537            .map(|(a, b)| (a - b).abs())
538            .sum();
539        std::mem::swap(&mut rank, &mut next_rank);
540        if delta < 1e-6 {
541            break;
542        }
543    }
544
545    let mut results: Vec<PageRankNode> = ids
546        .iter()
547        .enumerate()
548        .map(|(i, id)| {
549            let node = graph.get_node(id);
550            PageRankNode {
551                id: id.clone(),
552                label: node.map(|n| n.label.clone()).unwrap_or_default(),
553                score: rank[i],
554                degree: out_degree[i],
555            }
556        })
557        .collect();
558
559    results.sort_by(|a, b| {
560        b.score
561            .partial_cmp(&a.score)
562            .unwrap_or(std::cmp::Ordering::Equal)
563    });
564    results.truncate(top_n);
565    results
566}
567
568/// Detect dependency cycles using Tarjan's algorithm for strongly connected components.
569///
570/// Only considers directional edges (imports, uses, calls) to find true dependency cycles.
571/// Returns cycles sorted by severity (shorter cycles = more severe).
572pub fn detect_cycles(graph: &KnowledgeGraph, max_cycles: usize) -> Vec<DependencyCycle> {
573    let directional = ["imports", "uses", "calls", "includes"];
574
575    let ids = graph.node_ids();
576    let id_to_idx: HashMap<&str, usize> = ids
577        .iter()
578        .enumerate()
579        .map(|(i, s)| (s.as_str(), i))
580        .collect();
581    let n = ids.len();
582
583    let mut adj: Vec<Vec<usize>> = vec![Vec::new(); n];
584    for (src, tgt, edge) in graph.edges_with_endpoints() {
585        if directional.contains(&edge.relation.as_str())
586            && let (Some(&si), Some(&ti)) = (id_to_idx.get(src), id_to_idx.get(tgt))
587        {
588            adj[si].push(ti);
589        }
590    }
591
592    let sccs = tarjan_scc(&adj, n);
593
594    let mut cycles: Vec<DependencyCycle> = Vec::new();
595    for scc in &sccs {
596        if scc.len() <= 1 || cycles.len() >= max_cycles {
597            continue;
598        }
599
600        let scc_set: HashSet<usize> = scc.iter().copied().collect();
601        if let Some(cycle_indices) = find_cycle_in_scc(&adj, scc, &scc_set) {
602            let nodes: Vec<String> = cycle_indices.iter().map(|&i| ids[i].clone()).collect();
603            let edges: Vec<(String, String)> = cycle_indices
604                .windows(2)
605                .map(|w| (ids[w[0]].clone(), ids[w[1]].clone()))
606                .chain(std::iter::once((
607                    ids[*cycle_indices.last().unwrap()].clone(),
608                    ids[cycle_indices[0]].clone(),
609                )))
610                .collect();
611            let severity = 1.0 / nodes.len() as f64;
612            cycles.push(DependencyCycle {
613                nodes,
614                edges,
615                severity,
616            });
617        }
618    }
619
620    cycles.sort_by(|a, b| {
621        b.severity
622            .partial_cmp(&a.severity)
623            .unwrap_or(std::cmp::Ordering::Equal)
624    });
625    cycles.truncate(max_cycles);
626    cycles
627}
628
629/// Iterative Tarjan's algorithm for finding strongly connected components.
630fn tarjan_scc(adj: &[Vec<usize>], n: usize) -> Vec<Vec<usize>> {
631    let mut index_counter = 0usize;
632    let mut stack: Vec<usize> = Vec::new();
633    let mut on_stack = vec![false; n];
634    let mut index = vec![usize::MAX; n];
635    let mut lowlink = vec![0usize; n];
636    let mut result: Vec<Vec<usize>> = Vec::new();
637
638    for start in 0..n {
639        if index[start] != usize::MAX {
640            continue;
641        }
642
643        let mut call_stack: Vec<(usize, usize)> = Vec::new();
644
645        index[start] = index_counter;
646        lowlink[start] = index_counter;
647        index_counter += 1;
648        stack.push(start);
649        on_stack[start] = true;
650        call_stack.push((start, 0));
651
652        while let Some((v, mut ni)) = call_stack.pop() {
653            if ni < adj[v].len() {
654                let w = adj[v][ni];
655                ni += 1;
656                call_stack.push((v, ni));
657
658                if index[w] == usize::MAX {
659                    index[w] = index_counter;
660                    lowlink[w] = index_counter;
661                    index_counter += 1;
662                    stack.push(w);
663                    on_stack[w] = true;
664                    call_stack.push((w, 0));
665                } else if on_stack[w] {
666                    lowlink[v] = lowlink[v].min(index[w]);
667                }
668            } else {
669                if lowlink[v] == index[v] {
670                    let mut component = Vec::new();
671                    while let Some(w) = stack.pop() {
672                        on_stack[w] = false;
673                        component.push(w);
674                        if w == v {
675                            break;
676                        }
677                    }
678                    result.push(component);
679                }
680
681                if let Some(&(parent, _)) = call_stack.last() {
682                    lowlink[parent] = lowlink[parent].min(lowlink[v]);
683                }
684            }
685        }
686    }
687
688    result
689}
690
691/// Find a simple cycle within a strongly connected component using iterative DFS.
692fn find_cycle_in_scc(
693    adj: &[Vec<usize>],
694    scc: &[usize],
695    scc_set: &HashSet<usize>,
696) -> Option<Vec<usize>> {
697    if scc.is_empty() {
698        return None;
699    }
700    let start = scc[0];
701    let mut visited = HashSet::new();
702    let mut path = Vec::new();
703
704    let mut dfs_stack: Vec<(usize, usize)> = vec![(start, 0)];
705    path.push(start);
706    visited.insert(start);
707
708    while let Some((node, ni)) = dfs_stack.last_mut() {
709        if *ni < adj[*node].len() {
710            let next = adj[*node][*ni];
711            *ni += 1;
712
713            if !scc_set.contains(&next) {
714                continue;
715            }
716            if next == start && path.len() > 1 {
717                return Some(path.clone());
718            }
719            if !visited.contains(&next) {
720                visited.insert(next);
721                path.push(next);
722                dfs_stack.push((next, 0));
723            }
724        } else {
725            dfs_stack.pop();
726            path.pop();
727        }
728    }
729
730    None
731}
732
733pub mod embedding;
734pub mod temporal;
735
736#[cfg(test)]
737mod tests;