graphify_extract/
dedup.rs1use std::collections::HashSet;
7
8use graphify_core::model::ExtractionResult;
9
10pub fn dedup_file(result: &mut ExtractionResult) {
15 let mut seen_nodes = HashSet::new();
16 result.nodes.retain(|n| seen_nodes.insert(n.id.clone()));
17
18 let mut seen_edges: HashSet<(String, String, String)> = HashSet::new();
19 result
20 .edges
21 .retain(|e| seen_edges.insert((e.source.clone(), e.target.clone(), e.relation.clone())));
22}
23
24pub fn dedup_results(results: Vec<ExtractionResult>) -> ExtractionResult {
26 let mut combined = ExtractionResult::default();
27 for r in results {
28 combined.nodes.extend(r.nodes);
29 combined.edges.extend(r.edges);
30 combined.hyperedges.extend(r.hyperedges);
31 }
32 dedup_file(&mut combined);
33 combined
34}
35
#[cfg(test)]
mod tests {
    use super::*;
    use graphify_core::confidence::Confidence;
    use graphify_core::model::{GraphEdge, GraphNode, NodeType};
    use std::collections::HashMap;

    /// Builds a function node whose label is the same as its id.
    fn node(id: &str) -> GraphNode {
        let name = id.to_string();
        GraphNode {
            label: name.clone(),
            id: name,
            source_file: "test.rs".to_string(),
            source_location: None,
            node_type: NodeType::Function,
            community: None,
            extra: HashMap::new(),
        }
    }

    /// Builds an extracted edge `src -> tgt` labelled with relation `rel`.
    fn edge(src: &str, tgt: &str, rel: &str) -> GraphEdge {
        GraphEdge {
            source: src.to_string(),
            target: tgt.to_string(),
            relation: rel.to_string(),
            confidence: Confidence::Extracted,
            confidence_score: 1.0,
            source_file: "test.rs".to_string(),
            source_location: None,
            weight: 1.0,
            provenance: None,
            extra: HashMap::new(),
        }
    }

    /// Wraps nodes and edges in an `ExtractionResult` with no hyperedges.
    fn extraction(nodes: Vec<GraphNode>, edges: Vec<GraphEdge>) -> ExtractionResult {
        ExtractionResult {
            nodes,
            edges,
            hyperedges: Vec::new(),
        }
    }

    #[test]
    fn dedup_removes_duplicate_nodes() {
        let nodes = ["a", "b", "a", "c", "b"]
            .iter()
            .copied()
            .map(node)
            .collect();
        let mut result = extraction(nodes, Vec::new());

        dedup_file(&mut result);

        assert_eq!(result.nodes.len(), 3);
        // Survivors keep the order of their first appearance.
        let ids: Vec<&str> = result.nodes.iter().map(|n| n.id.as_str()).collect();
        assert_eq!(ids, ["a", "b", "c"]);
    }

    #[test]
    fn dedup_removes_duplicate_edges() {
        // Same endpoints with a different relation count as distinct edges.
        let edges = vec![
            edge("a", "b", "calls"),
            edge("a", "b", "calls"),
            edge("a", "b", "imports"),
            edge("c", "d", "calls"),
        ];
        let mut result = extraction(Vec::new(), edges);

        dedup_file(&mut result);

        assert_eq!(result.edges.len(), 3);
    }

    #[test]
    fn dedup_preserves_first_occurrence() {
        let first = GraphNode {
            label: "first".to_string(),
            ..node("x")
        };
        let second = GraphNode {
            label: "second".to_string(),
            ..node("x")
        };
        let mut result = extraction(vec![first, second], Vec::new());

        dedup_file(&mut result);

        assert_eq!(result.nodes.len(), 1);
        assert_eq!(result.nodes[0].label, "first");
    }

    #[test]
    fn dedup_no_duplicates_is_noop() {
        let mut result = extraction(
            vec![node("a"), node("b")],
            vec![edge("a", "b", "calls")],
        );

        dedup_file(&mut result);

        assert_eq!(result.nodes.len(), 2);
        assert_eq!(result.edges.len(), 1);
    }

    #[test]
    fn dedup_empty_is_noop() {
        let mut result = ExtractionResult::default();

        dedup_file(&mut result);

        assert!(result.nodes.is_empty());
        assert!(result.edges.is_empty());
    }

    #[test]
    fn dedup_results_merges_and_deduplicates() {
        let first = extraction(
            vec![node("a"), node("b")],
            vec![edge("a", "b", "calls")],
        );
        // Shares node "b" and the a->b "calls" edge with `first`.
        let second = extraction(
            vec![node("b"), node("c")],
            vec![edge("a", "b", "calls"), edge("b", "c", "imports")],
        );

        let merged = dedup_results(vec![first, second]);

        assert_eq!(merged.nodes.len(), 3);
        assert_eq!(merged.edges.len(), 2);
    }
}