1use std::collections::{HashMap, HashSet};
7
8use tracing::debug;
9
10use graphify_core::graph::KnowledgeGraph;
11use graphify_core::model::{BridgeNode, DependencyCycle, GodNode, PageRankNode, Surprise};
12
13pub fn god_nodes(graph: &KnowledgeGraph, top_n: usize) -> Vec<GodNode> {
19 let generic_labels = ["lib", "mod", "main", "index", "init"];
20
21 let mut candidates: Vec<GodNode> = graph
22 .node_ids()
23 .into_iter()
24 .filter(|id| !is_file_node(graph, id) && !is_method_stub(graph, id))
25 .map(|id| {
26 let node = graph.get_node(&id).unwrap();
27 let label = if generic_labels.contains(&node.label.as_str()) {
28 disambiguate_label(&node.label, &node.source_file)
29 } else {
30 node.label.clone()
31 };
32 GodNode {
33 id: id.clone(),
34 label,
35 degree: graph.degree(&id),
36 community: node.community,
37 }
38 })
39 .collect();
40
41 candidates.sort_by_key(|b| std::cmp::Reverse(b.degree));
42 candidates.truncate(top_n);
43 debug!("found {} god node candidates", candidates.len());
44 candidates
45}
46
47fn disambiguate_label(label: &str, source_file: &str) -> String {
55 let parts: Vec<&str> = source_file.split('/').collect();
56 for (i, &segment) in parts.iter().enumerate() {
57 if segment == "src" && i > 0 {
58 return format!("{}::{}", parts[i - 1], label);
59 }
60 }
61 if parts.len() >= 2 {
62 return format!("{}::{}", parts[parts.len() - 2], label);
63 }
64 label.to_string()
65}
66
67pub fn surprising_connections(
76 graph: &KnowledgeGraph,
77 communities: &HashMap<usize, Vec<String>>,
78 top_n: usize,
79) -> Vec<Surprise> {
80 let node_to_community: HashMap<&str, usize> = communities
81 .iter()
82 .flat_map(|(&cid, nodes)| nodes.iter().map(move |n| (n.as_str(), cid)))
83 .collect();
84
85 let mut surprises: Vec<(f64, Surprise)> = Vec::new();
86
87 for (src, tgt, edge) in graph.edges_with_endpoints() {
88 if is_file_node(graph, src) || is_file_node(graph, tgt) {
89 continue;
90 }
91 if is_method_stub(graph, src) || is_method_stub(graph, tgt) {
92 continue;
93 }
94
95 let src_comm = node_to_community.get(src).copied().unwrap_or(usize::MAX);
96 let tgt_comm = node_to_community.get(tgt).copied().unwrap_or(usize::MAX);
97
98 let mut score = 0.0;
99
100 if src_comm != tgt_comm {
101 score += 2.0;
102 }
103
104 let src_node = graph.get_node(src);
105 let tgt_node = graph.get_node(tgt);
106 if let (Some(sn), Some(tn)) = (src_node, tgt_node)
107 && !sn.source_file.is_empty()
108 && !tn.source_file.is_empty()
109 && sn.source_file != tn.source_file
110 {
111 score += 1.0;
112 }
113
114 use graphify_core::confidence::Confidence;
115 match edge.confidence {
116 Confidence::Ambiguous => score += 3.0,
117 Confidence::Inferred => score += 1.5,
118 Confidence::Extracted => {}
119 }
120
121 if score > 0.0 {
122 surprises.push((
123 score,
124 Surprise {
125 source: src.to_string(),
126 target: tgt.to_string(),
127 source_community: src_comm,
128 target_community: tgt_comm,
129 relation: edge.relation.clone(),
130 },
131 ));
132 }
133 }
134
135 surprises.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
136 surprises.truncate(top_n);
137 debug!("found {} surprising connections", surprises.len());
138 surprises.into_iter().map(|(_, s)| s).collect()
139}
140
141pub fn suggest_questions(
150 graph: &KnowledgeGraph,
151 communities: &HashMap<usize, Vec<String>>,
152 community_labels: &HashMap<usize, String>,
153 top_n: usize,
154) -> Vec<HashMap<String, String>> {
155 let mut questions: Vec<HashMap<String, String>> = Vec::new();
156
157 {
158 use graphify_core::confidence::Confidence;
159 for (src, tgt, edge) in graph.edges_with_endpoints() {
160 if edge.confidence == Confidence::Ambiguous {
161 let mut q = HashMap::new();
162 q.insert("category".into(), "ambiguous_relationship".into());
163 q.insert(
164 "question".into(),
165 format!(
166 "What is the exact relationship between '{}' and '{}'? (marked as {})",
167 src, tgt, edge.relation
168 ),
169 );
170 q.insert("source".into(), src.to_string());
171 q.insert("target".into(), tgt.to_string());
172 questions.push(q);
173 }
174 }
175 }
176
177 {
179 let node_to_comm: HashMap<&str, usize> = communities
180 .iter()
181 .flat_map(|(&cid, nodes)| nodes.iter().map(move |n| (n.as_str(), cid)))
182 .collect();
183
184 for id in graph.node_ids() {
185 if is_file_node(graph, &id) {
186 continue;
187 }
188 let nbrs = graph.get_neighbors(&id);
189 let nbr_comms: HashSet<usize> = nbrs
190 .iter()
191 .filter_map(|n| node_to_comm.get(n.id.as_str()).copied())
192 .collect();
193 if nbr_comms.len() >= 3 {
194 let comm_names: Vec<String> = nbr_comms
195 .iter()
196 .filter_map(|c| community_labels.get(c).cloned())
197 .collect();
198 let mut q = HashMap::new();
199 q.insert("category".into(), "bridge_node".into());
200 q.insert(
201 "question".into(),
202 format!(
203 "How does '{}' relate to {} different communities{}?",
204 id,
205 nbr_comms.len(),
206 if comm_names.is_empty() {
207 String::new()
208 } else {
209 format!(" ({})", comm_names.join(", "))
210 }
211 ),
212 );
213 q.insert("node".into(), id.clone());
214 questions.push(q);
215 }
216 }
217 }
218
219 {
220 use graphify_core::confidence::Confidence;
221 let gods = god_nodes(graph, 5);
222 for g in &gods {
223 let has_inferred = graph.edges_with_endpoints().iter().any(|(s, t, e)| {
224 (*s == g.id || *t == g.id) && e.confidence == Confidence::Inferred
225 });
226 if has_inferred {
227 let mut q = HashMap::new();
228 q.insert("category".into(), "verification".into());
229 q.insert(
230 "question".into(),
231 format!(
232 "Can you verify the inferred relationships of '{}' (degree {})?",
233 g.label, g.degree
234 ),
235 );
236 q.insert("node".into(), g.id.clone());
237 questions.push(q);
238 }
239 }
240 }
241
242 {
243 for id in graph.node_ids() {
244 if graph.degree(&id) == 0
245 && !is_file_node(graph, &id)
246 && let Some(node) = graph.get_node(&id)
247 {
248 let mut q = HashMap::new();
249 q.insert("category".into(), "isolated_node".into());
250 q.insert(
251 "question".into(),
252 format!(
253 "What role does '{}' play? It has no connections in the graph.",
254 node.label
255 ),
256 );
257 q.insert("node".into(), id.clone());
258 questions.push(q);
259 }
260 }
261 }
262
263 {
264 for (&cid, nodes) in communities {
265 let n = nodes.len();
266 if n <= 1 {
267 continue;
268 }
269 let cohesion = compute_cohesion(graph, nodes);
270 if cohesion < 0.3 {
271 let label = community_labels
272 .get(&cid)
273 .cloned()
274 .unwrap_or_else(|| format!("community-{cid}"));
275 let mut q = HashMap::new();
276 q.insert("category".into(), "low_cohesion".into());
277 q.insert(
278 "question".into(),
279 format!(
280 "Why is '{label}' ({n} nodes) loosely connected (cohesion {cohesion:.2})? Should it be split?"
281 ),
282 );
283 q.insert("community".into(), cid.to_string());
284 questions.push(q);
285 }
286 }
287 }
288
289 questions.truncate(top_n);
290 debug!("generated {} questions", questions.len());
291 questions
292}
293
294pub fn graph_diff(
296 old: &KnowledgeGraph,
297 new: &KnowledgeGraph,
298) -> HashMap<String, serde_json::Value> {
299 let old_node_ids: HashSet<String> = old.node_ids().into_iter().collect();
300 let new_node_ids: HashSet<String> = new.node_ids().into_iter().collect();
301
302 let added_nodes: Vec<&String> = new_node_ids.difference(&old_node_ids).collect();
303 let removed_nodes: Vec<&String> = old_node_ids.difference(&new_node_ids).collect();
304
305 let old_edge_keys: HashSet<(String, String, String)> = old
306 .edges_with_endpoints()
307 .iter()
308 .map(|(s, t, e)| (s.to_string(), t.to_string(), e.relation.clone()))
309 .collect();
310 let new_edge_keys: HashSet<(String, String, String)> = new
311 .edges_with_endpoints()
312 .iter()
313 .map(|(s, t, e)| (s.to_string(), t.to_string(), e.relation.clone()))
314 .collect();
315
316 let added_edges: Vec<&(String, String, String)> =
317 new_edge_keys.difference(&old_edge_keys).collect();
318 let removed_edges: Vec<&(String, String, String)> =
319 old_edge_keys.difference(&new_edge_keys).collect();
320
321 let mut result = HashMap::new();
322 result.insert("added_nodes".into(), serde_json::json!(added_nodes));
323 result.insert("removed_nodes".into(), serde_json::json!(removed_nodes));
324 result.insert(
325 "added_edges".into(),
326 serde_json::json!(
327 added_edges
328 .iter()
329 .map(|(s, t, r)| { serde_json::json!({"source": s, "target": t, "relation": r}) })
330 .collect::<Vec<_>>()
331 ),
332 );
333 result.insert(
334 "removed_edges".into(),
335 serde_json::json!(
336 removed_edges
337 .iter()
338 .map(|(s, t, r)| { serde_json::json!({"source": s, "target": t, "relation": r}) })
339 .collect::<Vec<_>>()
340 ),
341 );
342 result.insert(
343 "summary".into(),
344 serde_json::json!({
345 "nodes_added": added_nodes.len(),
346 "nodes_removed": removed_nodes.len(),
347 "edges_added": added_edges.len(),
348 "edges_removed": removed_edges.len(),
349 "old_node_count": old.node_count(),
350 "new_node_count": new.node_count(),
351 "old_edge_count": old.edge_count(),
352 "new_edge_count": new.edge_count(),
353 }),
354 );
355
356 result
357}
358
359pub fn community_bridges(
364 graph: &KnowledgeGraph,
365 communities: &HashMap<usize, Vec<String>>,
366 top_n: usize,
367) -> Vec<BridgeNode> {
368 let node_to_community: HashMap<&str, usize> = communities
369 .iter()
370 .flat_map(|(&cid, nodes)| nodes.iter().map(move |n| (n.as_str(), cid)))
371 .collect();
372
373 let mut bridges: Vec<BridgeNode> = graph
374 .node_ids()
375 .into_iter()
376 .filter(|id| !is_file_node(graph, id))
377 .filter_map(|id| {
378 let node = graph.get_node(&id)?;
379 let my_comm = node_to_community.get(id.as_str()).copied()?;
380 let neighbors = graph.neighbor_ids(&id);
381 let total = neighbors.len();
382 if total == 0 {
383 return None;
384 }
385
386 let mut touched: HashSet<usize> = HashSet::new();
387 touched.insert(my_comm);
388 let mut cross = 0usize;
389 for nid in &neighbors {
390 let neighbor_comm = node_to_community
391 .get(nid.as_str())
392 .copied()
393 .unwrap_or(my_comm);
394 if neighbor_comm != my_comm {
395 cross += 1;
396 touched.insert(neighbor_comm);
397 }
398 }
399
400 if cross == 0 {
401 return None;
402 }
403
404 let ratio = cross as f64 / total as f64;
405 let mut communities_touched: Vec<usize> = touched.into_iter().collect();
406 communities_touched.sort_unstable();
407
408 Some(BridgeNode {
409 id: id.clone(),
410 label: node.label.clone(),
411 total_edges: total,
412 cross_community_edges: cross,
413 bridge_ratio: ratio,
414 communities_touched,
415 })
416 })
417 .collect();
418
419 bridges.sort_by(|a, b| {
420 b.bridge_ratio
421 .partial_cmp(&a.bridge_ratio)
422 .unwrap_or(std::cmp::Ordering::Equal)
423 });
424 bridges.truncate(top_n);
425 bridges
426}
427
428fn is_file_node(graph: &KnowledgeGraph, node_id: &str) -> bool {
430 if let Some(node) = graph.get_node(node_id)
431 && !node.source_file.is_empty()
432 && let Some(fname) = std::path::Path::new(&node.source_file).file_name()
433 && node.label == fname.to_string_lossy()
434 {
435 return true;
436 }
437 false
438}
439
440fn is_method_stub(graph: &KnowledgeGraph, node_id: &str) -> bool {
442 if let Some(node) = graph.get_node(node_id) {
443 if node.label.starts_with('.') && node.label.ends_with("()") {
444 return true;
445 }
446 if node.label.ends_with("()") && graph.degree(node_id) <= 1 {
447 return true;
448 }
449 }
450 false
451}
452
453fn compute_cohesion(graph: &KnowledgeGraph, community_nodes: &[String]) -> f64 {
455 let n = community_nodes.len();
456 if n <= 1 {
457 return 1.0;
458 }
459 let node_set: HashSet<&str> = community_nodes
460 .iter()
461 .map(std::string::String::as_str)
462 .collect();
463 let mut actual_edges = 0usize;
464 for node_id in community_nodes {
465 for neighbor in graph.get_neighbors(node_id) {
466 if node_set.contains(neighbor.id.as_str()) {
467 actual_edges += 1;
468 }
469 }
470 }
471 actual_edges /= 2;
472 let possible = n * (n - 1) / 2;
473 if possible == 0 {
474 return 0.0;
475 }
476 actual_edges as f64 / possible as f64
477}
478
479pub fn pagerank(
484 graph: &KnowledgeGraph,
485 top_n: usize,
486 damping: f64,
487 max_iterations: usize,
488) -> Vec<PageRankNode> {
489 let n = graph.node_count();
490 if n == 0 {
491 return Vec::new();
492 }
493
494 let ids = graph.node_ids();
495 let id_to_idx: HashMap<&str, usize> = ids
496 .iter()
497 .enumerate()
498 .map(|(i, s)| (s.as_str(), i))
499 .collect();
500
501 let mut adj: Vec<Vec<usize>> = vec![Vec::new(); n];
502 for (src, tgt, _) in graph.edges_with_endpoints() {
503 if let (Some(&si), Some(&ti)) = (id_to_idx.get(src), id_to_idx.get(tgt)) {
504 adj[si].push(ti);
505 adj[ti].push(si);
506 }
507 }
508
509 let out_degree: Vec<usize> = adj.iter().map(std::vec::Vec::len).collect();
510 let init = 1.0 / n as f64;
511 let mut rank = vec![init; n];
512 let mut next_rank = vec![0.0f64; n];
513
514 for _ in 0..max_iterations {
515 let teleport = (1.0 - damping) / n as f64;
516
517 let dangling_sum: f64 = rank
518 .iter()
519 .enumerate()
520 .filter(|(i, _)| out_degree[*i] == 0)
521 .map(|(_, r)| r)
522 .sum();
523
524 for v in 0..n {
525 let mut sum = 0.0;
526 for &u in &adj[v] {
527 if out_degree[u] > 0 {
528 sum += rank[u] / out_degree[u] as f64;
529 }
530 }
531 next_rank[v] = teleport + damping * (sum + dangling_sum / n as f64);
532 }
533
534 let delta: f64 = rank
535 .iter()
536 .zip(next_rank.iter())
537 .map(|(a, b)| (a - b).abs())
538 .sum();
539 std::mem::swap(&mut rank, &mut next_rank);
540 if delta < 1e-6 {
541 break;
542 }
543 }
544
545 let mut results: Vec<PageRankNode> = ids
546 .iter()
547 .enumerate()
548 .map(|(i, id)| {
549 let node = graph.get_node(id);
550 PageRankNode {
551 id: id.clone(),
552 label: node.map(|n| n.label.clone()).unwrap_or_default(),
553 score: rank[i],
554 degree: out_degree[i],
555 }
556 })
557 .collect();
558
559 results.sort_by(|a, b| {
560 b.score
561 .partial_cmp(&a.score)
562 .unwrap_or(std::cmp::Ordering::Equal)
563 });
564 results.truncate(top_n);
565 results
566}
567
568pub fn detect_cycles(graph: &KnowledgeGraph, max_cycles: usize) -> Vec<DependencyCycle> {
573 let directional = ["imports", "uses", "calls", "includes"];
574
575 let ids = graph.node_ids();
576 let id_to_idx: HashMap<&str, usize> = ids
577 .iter()
578 .enumerate()
579 .map(|(i, s)| (s.as_str(), i))
580 .collect();
581 let n = ids.len();
582
583 let mut adj: Vec<Vec<usize>> = vec![Vec::new(); n];
584 for (src, tgt, edge) in graph.edges_with_endpoints() {
585 if directional.contains(&edge.relation.as_str())
586 && let (Some(&si), Some(&ti)) = (id_to_idx.get(src), id_to_idx.get(tgt))
587 {
588 adj[si].push(ti);
589 }
590 }
591
592 let sccs = tarjan_scc(&adj, n);
593
594 let mut cycles: Vec<DependencyCycle> = Vec::new();
595 for scc in &sccs {
596 if scc.len() <= 1 || cycles.len() >= max_cycles {
597 continue;
598 }
599
600 let scc_set: HashSet<usize> = scc.iter().copied().collect();
601 if let Some(cycle_indices) = find_cycle_in_scc(&adj, scc, &scc_set) {
602 let nodes: Vec<String> = cycle_indices.iter().map(|&i| ids[i].clone()).collect();
603 let edges: Vec<(String, String)> = cycle_indices
604 .windows(2)
605 .map(|w| (ids[w[0]].clone(), ids[w[1]].clone()))
606 .chain(std::iter::once((
607 ids[*cycle_indices.last().unwrap()].clone(),
608 ids[cycle_indices[0]].clone(),
609 )))
610 .collect();
611 let severity = 1.0 / nodes.len() as f64;
612 cycles.push(DependencyCycle {
613 nodes,
614 edges,
615 severity,
616 });
617 }
618 }
619
620 cycles.sort_by(|a, b| {
621 b.severity
622 .partial_cmp(&a.severity)
623 .unwrap_or(std::cmp::Ordering::Equal)
624 });
625 cycles.truncate(max_cycles);
626 cycles
627}
628
629fn tarjan_scc(adj: &[Vec<usize>], n: usize) -> Vec<Vec<usize>> {
631 let mut index_counter = 0usize;
632 let mut stack: Vec<usize> = Vec::new();
633 let mut on_stack = vec![false; n];
634 let mut index = vec![usize::MAX; n];
635 let mut lowlink = vec![0usize; n];
636 let mut result: Vec<Vec<usize>> = Vec::new();
637
638 for start in 0..n {
639 if index[start] != usize::MAX {
640 continue;
641 }
642
643 let mut call_stack: Vec<(usize, usize)> = Vec::new();
644
645 index[start] = index_counter;
646 lowlink[start] = index_counter;
647 index_counter += 1;
648 stack.push(start);
649 on_stack[start] = true;
650 call_stack.push((start, 0));
651
652 while let Some((v, mut ni)) = call_stack.pop() {
653 if ni < adj[v].len() {
654 let w = adj[v][ni];
655 ni += 1;
656 call_stack.push((v, ni));
657
658 if index[w] == usize::MAX {
659 index[w] = index_counter;
660 lowlink[w] = index_counter;
661 index_counter += 1;
662 stack.push(w);
663 on_stack[w] = true;
664 call_stack.push((w, 0));
665 } else if on_stack[w] {
666 lowlink[v] = lowlink[v].min(index[w]);
667 }
668 } else {
669 if lowlink[v] == index[v] {
670 let mut component = Vec::new();
671 while let Some(w) = stack.pop() {
672 on_stack[w] = false;
673 component.push(w);
674 if w == v {
675 break;
676 }
677 }
678 result.push(component);
679 }
680
681 if let Some(&(parent, _)) = call_stack.last() {
682 lowlink[parent] = lowlink[parent].min(lowlink[v]);
683 }
684 }
685 }
686 }
687
688 result
689}
690
691fn find_cycle_in_scc(
693 adj: &[Vec<usize>],
694 scc: &[usize],
695 scc_set: &HashSet<usize>,
696) -> Option<Vec<usize>> {
697 if scc.is_empty() {
698 return None;
699 }
700 let start = scc[0];
701 let mut visited = HashSet::new();
702 let mut path = Vec::new();
703
704 let mut dfs_stack: Vec<(usize, usize)> = vec![(start, 0)];
705 path.push(start);
706 visited.insert(start);
707
708 while let Some((node, ni)) = dfs_stack.last_mut() {
709 if *ni < adj[*node].len() {
710 let next = adj[*node][*ni];
711 *ni += 1;
712
713 if !scc_set.contains(&next) {
714 continue;
715 }
716 if next == start && path.len() > 1 {
717 return Some(path.clone());
718 }
719 if !visited.contains(&next) {
720 visited.insert(next);
721 path.push(next);
722 dfs_stack.push((next, 0));
723 }
724 } else {
725 dfs_stack.pop();
726 path.pop();
727 }
728 }
729
730 None
731}
732
733pub mod embedding;
734pub mod temporal;
735
736#[cfg(test)]
737mod tests;