Skip to main content

graphify_extract/
dedup.rs

//! Deduplication of extracted nodes and edges.
//!
//! After per-file extraction, duplicate nodes (same ID) and edges (same
//! source + target + relation triple) are removed to produce a clean graph.
5
6use std::collections::HashSet;
7
8use graphify_core::model::ExtractionResult;
9
10/// Deduplicate nodes within a single file's [`ExtractionResult`].
11///
12/// Retains the **first** occurrence of each node ID and each
13/// `(source, target, relation)` edge triple.
14pub fn dedup_file(result: &mut ExtractionResult) {
15    // Dedup nodes by ID
16    let mut seen_nodes = HashSet::new();
17    result.nodes.retain(|n| seen_nodes.insert(n.id.clone()));
18
19    // Dedup edges by (source, target, relation) triple
20    let mut seen_edges: HashSet<(String, String, String)> = HashSet::new();
21    result
22        .edges
23        .retain(|e| seen_edges.insert((e.source.clone(), e.target.clone(), e.relation.clone())));
24}
25
26/// Merge multiple [`ExtractionResult`]s into one, deduplicating across all of them.
27pub fn dedup_results(results: Vec<ExtractionResult>) -> ExtractionResult {
28    let mut combined = ExtractionResult::default();
29    for r in results {
30        combined.nodes.extend(r.nodes);
31        combined.edges.extend(r.edges);
32        combined.hyperedges.extend(r.hyperedges);
33    }
34    dedup_file(&mut combined);
35    combined
36}
37
38// ---------------------------------------------------------------------------
39// Tests
40// ---------------------------------------------------------------------------
41
#[cfg(test)]
mod tests {
    use super::*;
    use graphify_core::confidence::Confidence;
    use graphify_core::model::{GraphEdge, GraphNode, NodeType};
    use std::collections::HashMap;

    /// Build a minimal function node whose label equals its ID.
    fn node(id: &str) -> GraphNode {
        GraphNode {
            id: id.to_string(),
            label: id.to_string(),
            source_file: "test.rs".to_string(),
            source_location: None,
            node_type: NodeType::Function,
            community: None,
            extra: HashMap::new(),
        }
    }

    /// Build a minimal edge for the given `(source, target, relation)` triple.
    fn edge(src: &str, tgt: &str, rel: &str) -> GraphEdge {
        GraphEdge {
            source: src.to_string(),
            target: tgt.to_string(),
            relation: rel.to_string(),
            confidence: Confidence::Extracted,
            confidence_score: 1.0,
            source_file: "test.rs".to_string(),
            source_location: None,
            weight: 1.0,
            extra: HashMap::new(),
        }
    }

    /// Node IDs of `result`, in their current order.
    fn node_ids(result: &ExtractionResult) -> Vec<&str> {
        result.nodes.iter().map(|n| n.id.as_str()).collect()
    }

    /// `(source, target, relation)` triples of `result`, in their current order.
    fn edge_triples(result: &ExtractionResult) -> Vec<(&str, &str, &str)> {
        result
            .edges
            .iter()
            .map(|e| (e.source.as_str(), e.target.as_str(), e.relation.as_str()))
            .collect()
    }

    #[test]
    fn dedup_removes_duplicate_nodes() {
        let mut result = ExtractionResult {
            nodes: vec![node("a"), node("b"), node("a"), node("c"), node("b")],
            edges: Vec::new(),
            hyperedges: Vec::new(),
        };
        dedup_file(&mut result);
        assert_eq!(result.nodes.len(), 3);
        assert_eq!(node_ids(&result), vec!["a", "b", "c"]);
    }

    #[test]
    fn dedup_removes_duplicate_edges() {
        let mut result = ExtractionResult {
            nodes: Vec::new(),
            edges: vec![
                edge("a", "b", "calls"),
                edge("a", "b", "calls"),   // duplicate
                edge("a", "b", "imports"), // different relation — keep
                edge("c", "d", "calls"),
            ],
            hyperedges: Vec::new(),
        };
        dedup_file(&mut result);
        assert_eq!(result.edges.len(), 3);
        // Pin *which* edges survive and in what order, not just how many.
        assert_eq!(
            edge_triples(&result),
            vec![
                ("a", "b", "calls"),
                ("a", "b", "imports"),
                ("c", "d", "calls"),
            ]
        );
    }

    #[test]
    fn dedup_preserves_first_occurrence() {
        let first = GraphNode {
            label: "first".to_string(),
            ..node("x")
        };
        let second = GraphNode {
            label: "second".to_string(),
            ..node("x")
        };

        let mut result = ExtractionResult {
            nodes: vec![first, second],
            edges: Vec::new(),
            hyperedges: Vec::new(),
        };
        dedup_file(&mut result);
        assert_eq!(result.nodes.len(), 1);
        assert_eq!(result.nodes[0].label, "first");
    }

    #[test]
    fn dedup_no_duplicates_is_noop() {
        let mut result = ExtractionResult {
            nodes: vec![node("a"), node("b")],
            edges: vec![edge("a", "b", "calls")],
            hyperedges: Vec::new(),
        };
        dedup_file(&mut result);
        // A no-op must leave contents and order intact, not merely the counts.
        assert_eq!(node_ids(&result), vec!["a", "b"]);
        assert_eq!(edge_triples(&result), vec![("a", "b", "calls")]);
    }

    #[test]
    fn dedup_empty_is_noop() {
        let mut result = ExtractionResult::default();
        dedup_file(&mut result);
        assert!(result.nodes.is_empty());
        assert!(result.edges.is_empty());
        assert!(result.hyperedges.is_empty());
    }

    #[test]
    fn dedup_results_merges_and_deduplicates() {
        let r1 = ExtractionResult {
            nodes: vec![node("a"), node("b")],
            edges: vec![edge("a", "b", "calls")],
            hyperedges: Vec::new(),
        };
        let r2 = ExtractionResult {
            nodes: vec![node("b"), node("c")],
            edges: vec![edge("a", "b", "calls"), edge("b", "c", "imports")],
            hyperedges: Vec::new(),
        };
        let merged = dedup_results(vec![r1, r2]);
        // Inputs are merged in order, so the first sighting of each key wins.
        assert_eq!(node_ids(&merged), vec!["a", "b", "c"]);
        assert_eq!(
            edge_triples(&merged),
            vec![("a", "b", "calls"), ("b", "c", "imports")]
        );
        assert!(merged.hyperedges.is_empty());
    }
}