Skip to main content

graphify_extract/
dedup.rs

1//! Deduplication of extracted nodes and edges.
2//!
3//! After per-file extraction, duplicate nodes (same ID) and edges (same
4//! source + target + relation triple) are removed to produce a clean graph.
5
6use std::collections::HashSet;
7
8use graphify_core::model::ExtractionResult;
9
10/// Deduplicate nodes within a single file's [`ExtractionResult`].
11///
12/// Retains the **first** occurrence of each node ID and each
13/// `(source, target, relation)` edge triple.
14pub fn dedup_file(result: &mut ExtractionResult) {
15    let mut seen_nodes = HashSet::new();
16    result.nodes.retain(|n| seen_nodes.insert(n.id.clone()));
17
18    let mut seen_edges: HashSet<(String, String, String)> = HashSet::new();
19    result
20        .edges
21        .retain(|e| seen_edges.insert((e.source.clone(), e.target.clone(), e.relation.clone())));
22}
23
24/// Merge multiple [`ExtractionResult`]s into one, deduplicating across all of them.
25pub fn dedup_results(results: Vec<ExtractionResult>) -> ExtractionResult {
26    let mut combined = ExtractionResult::default();
27    for r in results {
28        combined.nodes.extend(r.nodes);
29        combined.edges.extend(r.edges);
30        combined.hyperedges.extend(r.hyperedges);
31    }
32    dedup_file(&mut combined);
33    combined
34}
35
#[cfg(test)]
mod tests {
    use super::*;
    use graphify_core::confidence::Confidence;
    use graphify_core::model::{GraphEdge, GraphNode, NodeType};
    use std::collections::HashMap;

    /// Build a function node in `test.rs` whose label mirrors its ID.
    fn node(id: &str) -> GraphNode {
        let id = id.to_string();
        GraphNode {
            label: id.clone(),
            id,
            source_file: "test.rs".to_string(),
            source_location: None,
            node_type: NodeType::Function,
            community: None,
            extra: HashMap::new(),
        }
    }

    /// Build an extracted edge in `test.rs` with full confidence and unit weight.
    fn edge(src: &str, tgt: &str, rel: &str) -> GraphEdge {
        GraphEdge {
            source: src.to_string(),
            target: tgt.to_string(),
            relation: rel.to_string(),
            confidence: Confidence::Extracted,
            confidence_score: 1.0,
            source_file: "test.rs".to_string(),
            source_location: None,
            weight: 1.0,
            extra: HashMap::new(),
        }
    }

    /// Assemble an [`ExtractionResult`] with the given nodes and edges and no hyperedges.
    fn graph(nodes: Vec<GraphNode>, edges: Vec<GraphEdge>) -> ExtractionResult {
        ExtractionResult {
            nodes,
            edges,
            hyperedges: Vec::new(),
        }
    }

    #[test]
    fn dedup_removes_duplicate_nodes() {
        let nodes = ["a", "b", "a", "c", "b"]
            .iter()
            .map(|id| node(id))
            .collect();
        let mut result = graph(nodes, Vec::new());

        dedup_file(&mut result);

        assert_eq!(result.nodes.len(), 3);
        let surviving: Vec<&str> = result.nodes.iter().map(|n| n.id.as_str()).collect();
        assert_eq!(surviving, vec!["a", "b", "c"]);
    }

    #[test]
    fn dedup_removes_duplicate_edges() {
        let edges = vec![
            edge("a", "b", "calls"),
            // Exact repeat of the previous triple: must be dropped.
            edge("a", "b", "calls"),
            // Same endpoints but a different relation: must be kept.
            edge("a", "b", "imports"),
            edge("c", "d", "calls"),
        ];
        let mut result = graph(Vec::new(), edges);

        dedup_file(&mut result);

        assert_eq!(result.edges.len(), 3);
    }

    #[test]
    fn dedup_preserves_first_occurrence() {
        let first = GraphNode {
            label: "first".to_string(),
            ..node("x")
        };
        let second = GraphNode {
            label: "second".to_string(),
            ..node("x")
        };
        let mut result = graph(vec![first, second], Vec::new());

        dedup_file(&mut result);

        assert_eq!(result.nodes.len(), 1);
        assert_eq!(result.nodes[0].label, "first");
    }

    #[test]
    fn dedup_no_duplicates_is_noop() {
        let mut result = graph(vec![node("a"), node("b")], vec![edge("a", "b", "calls")]);

        dedup_file(&mut result);

        assert_eq!(result.nodes.len(), 2);
        assert_eq!(result.edges.len(), 1);
    }

    #[test]
    fn dedup_empty_is_noop() {
        let mut result = ExtractionResult::default();

        dedup_file(&mut result);

        assert!(result.nodes.is_empty());
        assert!(result.edges.is_empty());
    }

    #[test]
    fn dedup_results_merges_and_deduplicates() {
        let first = graph(vec![node("a"), node("b")], vec![edge("a", "b", "calls")]);
        let second = graph(
            vec![node("b"), node("c")],
            vec![edge("a", "b", "calls"), edge("b", "c", "imports")],
        );

        let merged = dedup_results(vec![first, second]);

        assert_eq!(merged.nodes.len(), 3);
        assert_eq!(merged.edges.len(), 2);
    }
}
151}