Skip to main content

graphify_extract/
dedup.rs

1//! Deduplication of extracted nodes and edges.
2//!
3//! After per-file extraction, duplicate nodes (same ID) and edges (same
4//! source + target + relation triple) are removed to produce a clean graph.
5
6use std::collections::HashSet;
7
8use graphify_core::model::ExtractionResult;
9
10/// Deduplicate nodes within a single file's [`ExtractionResult`].
11///
12/// Retains the **first** occurrence of each node ID and each
13/// `(source, target, relation)` edge triple.
14pub fn dedup_file(result: &mut ExtractionResult) {
15    let mut seen_nodes = HashSet::new();
16    result.nodes.retain(|n| seen_nodes.insert(n.id.clone()));
17
18    let mut seen_edges: HashSet<(String, String, String)> = HashSet::new();
19    result
20        .edges
21        .retain(|e| seen_edges.insert((e.source.clone(), e.target.clone(), e.relation.clone())));
22}
23
24/// Merge multiple [`ExtractionResult`]s into one, deduplicating across all of them.
25pub fn dedup_results(results: Vec<ExtractionResult>) -> ExtractionResult {
26    let mut combined = ExtractionResult::default();
27    for r in results {
28        combined.nodes.extend(r.nodes);
29        combined.edges.extend(r.edges);
30        combined.hyperedges.extend(r.hyperedges);
31    }
32    dedup_file(&mut combined);
33    combined
34}
35
#[cfg(test)]
mod tests {
    use super::*;
    use graphify_core::confidence::Confidence;
    use graphify_core::model::{GraphEdge, GraphNode, NodeType};
    use std::collections::HashMap;

    /// Build a function-typed node whose `id` and `label` are both `id`.
    fn node(id: &str) -> GraphNode {
        GraphNode {
            id: id.to_owned(),
            label: id.to_owned(),
            source_file: String::from("test.rs"),
            source_location: None,
            node_type: NodeType::Function,
            community: None,
            extra: HashMap::new(),
        }
    }

    /// Build an extracted edge `src --rel--> tgt` with unit weight and score.
    fn edge(src: &str, tgt: &str, rel: &str) -> GraphEdge {
        GraphEdge {
            source: src.to_owned(),
            target: tgt.to_owned(),
            relation: rel.to_owned(),
            confidence: Confidence::Extracted,
            confidence_score: 1.0,
            source_file: String::from("test.rs"),
            source_location: None,
            weight: 1.0,
            provenance: None,
            extra: HashMap::new(),
        }
    }

    /// Assemble an [`ExtractionResult`] from nodes and edges, with no hyperedges.
    fn graph(nodes: Vec<GraphNode>, edges: Vec<GraphEdge>) -> ExtractionResult {
        ExtractionResult {
            nodes,
            edges,
            hyperedges: Vec::new(),
        }
    }

    /// Repeated node IDs collapse to one entry each, in first-seen order.
    #[test]
    fn dedup_removes_duplicate_nodes() {
        let mut result = graph(
            vec![node("a"), node("b"), node("a"), node("c"), node("b")],
            Vec::new(),
        );
        dedup_file(&mut result);

        assert_eq!(result.nodes.len(), 3);
        let ids: Vec<&str> = result.nodes.iter().map(|n| n.id.as_str()).collect();
        assert_eq!(ids, vec!["a", "b", "c"]);
    }

    /// Only an identical `(source, target, relation)` triple counts as a duplicate.
    #[test]
    fn dedup_removes_duplicate_edges() {
        let edges = vec![
            edge("a", "b", "calls"),
            edge("a", "b", "calls"),   // exact repeat: dropped
            edge("a", "b", "imports"), // same endpoints, other relation: kept
            edge("c", "d", "calls"),
        ];
        let mut result = graph(Vec::new(), edges);
        dedup_file(&mut result);

        assert_eq!(result.edges.len(), 3);
    }

    /// When two nodes share an ID, the earlier one is the survivor.
    #[test]
    fn dedup_preserves_first_occurrence() {
        let mut first = node("x");
        first.label = "first".to_string();
        let mut second = node("x");
        second.label = "second".to_string();

        let mut result = graph(vec![first, second], Vec::new());
        dedup_file(&mut result);

        assert_eq!(result.nodes.len(), 1);
        assert_eq!(result.nodes[0].label, "first");
    }

    /// Input without repeats keeps every node and edge.
    #[test]
    fn dedup_no_duplicates_is_noop() {
        let mut result = graph(
            vec![node("a"), node("b")],
            vec![edge("a", "b", "calls")],
        );
        dedup_file(&mut result);

        assert_eq!(result.nodes.len(), 2);
        assert_eq!(result.edges.len(), 1);
    }

    /// A default (empty) result stays empty.
    #[test]
    fn dedup_empty_is_noop() {
        let mut result = ExtractionResult::default();
        dedup_file(&mut result);

        assert!(result.nodes.is_empty());
        assert!(result.edges.is_empty());
    }

    /// Merging two overlapping results removes the shared node and edge.
    #[test]
    fn dedup_results_merges_and_deduplicates() {
        let r1 = graph(
            vec![node("a"), node("b")],
            vec![edge("a", "b", "calls")],
        );
        let r2 = graph(
            vec![node("b"), node("c")],
            vec![edge("a", "b", "calls"), edge("b", "c", "imports")],
        );

        let merged = dedup_results(vec![r1, r2]);

        assert_eq!(merged.nodes.len(), 3);
        assert_eq!(merged.edges.len(), 2);
    }
}