Skip to main content

cognee_cognify/graph_integration/
deduplication.rs

1//! Graph deduplication utilities.
2//!
3//! Mirrors Python's `cognee/modules/graph/utils/deduplicate_nodes_and_edges.py`
4//! Provides in-memory deduplication of nodes and edges using HashMaps.
5
6use std::collections::HashMap;
7
8use crate::graph_integration::types::{GraphEdgePair, GraphNodePair};
9
10/// Result of deduplication operation.
11#[derive(Debug, Clone)]
12pub struct DeduplicationResult {
13    /// Unique nodes after deduplication
14    pub unique_nodes: Vec<GraphNodePair>,
15
16    /// Unique edges after deduplication
17    pub unique_edges: Vec<GraphEdgePair>,
18}
19
20/// Deduplicate nodes and edges in-memory.
21///
22/// **Node deduplication key**: `str(entity.id)` (entity UUID as string)
23/// **Edge deduplication key**: `"{source_id}_{target_id}_{relationship_name}"`
24///
25/// # Arguments
26/// * `nodes` - Vector of GraphNodePair to deduplicate
27/// * `edges` - Vector of GraphEdgePair to deduplicate
28///
29/// # Returns
30/// DeduplicationResult with unique nodes and edges.
31pub fn deduplicate_nodes_and_edges(
32    nodes: Vec<GraphNodePair>,
33    edges: Vec<GraphEdgePair>,
34) -> DeduplicationResult {
35    // Deduplicate nodes by entity ID
36    let mut node_map = HashMap::new();
37    for node in nodes {
38        let key = node.entity.base.id;
39        // Later entries with same ID will overwrite earlier ones
40        node_map.insert(key, node);
41    }
42
43    // Deduplicate edges by (source, target, relationship) tuple
44    let mut edge_map = HashMap::new();
45    for edge in edges {
46        let key = edge.dedup_key();
47        // Later entries with same key will overwrite earlier ones
48        edge_map.insert(key, edge);
49    }
50
51    DeduplicationResult {
52        unique_nodes: node_map.into_values().collect(),
53        unique_edges: edge_map.into_values().collect(),
54    }
55}
56
57#[cfg(test)]
58mod tests {
59    use super::*;
60    use cognee_models::{Entity, EntityType};
61    use uuid::Uuid;
62
63    fn create_test_node(name: &str, type_name: &str) -> GraphNodePair {
64        let entity = Entity::new(name, None, format!("{name} description"), None);
65        let entity_type = EntityType::from_node_type(type_name, None);
66        GraphNodePair {
67            entity,
68            entity_type,
69        }
70    }
71
72    fn create_test_edge(source_id: Uuid, target_id: Uuid, relationship: &str) -> GraphEdgePair {
73        GraphEdgePair::new(source_id, target_id, relationship)
74    }
75
76    #[test]
77    fn test_deduplicate_nodes_removes_duplicates() {
78        let node1 = create_test_node("TechCorp", "Organization");
79        let node2 = create_test_node("TechCorp", "Organization");
80
81        // Force same entity ID to create duplicate
82        let mut node2_clone = node2.clone();
83        node2_clone.entity.base.id = node1.entity.base.id;
84
85        let result = deduplicate_nodes_and_edges(vec![node1.clone(), node2_clone], vec![]);
86
87        assert_eq!(result.unique_nodes.len(), 1);
88        assert_eq!(result.unique_nodes[0].entity.name, "TechCorp");
89    }
90
91    #[test]
92    fn test_deduplicate_edges_removes_duplicates() {
93        let source_id = Uuid::new_v4();
94        let target_id = Uuid::new_v4();
95
96        let edge1 = create_test_edge(source_id, target_id, "works_at");
97        let edge2 = create_test_edge(source_id, target_id, "works_at");
98
99        let result = deduplicate_nodes_and_edges(vec![], vec![edge1, edge2]);
100
101        assert_eq!(result.unique_edges.len(), 1);
102        assert_eq!(result.unique_edges[0].relationship_name, "works_at");
103    }
104
105    #[test]
106    fn test_deduplicate_preserves_unique_nodes() {
107        let node1 = create_test_node("TechCorp", "Organization");
108        let node2 = create_test_node("Alice", "Person");
109        let node3 = create_test_node("London", "Location");
110
111        let result = deduplicate_nodes_and_edges(vec![node1, node2, node3], vec![]);
112
113        assert_eq!(result.unique_nodes.len(), 3);
114
115        // Verify all unique nodes are present (order might differ due to HashMap)
116        let names: Vec<String> = result
117            .unique_nodes
118            .iter()
119            .map(|n| n.entity.name.clone())
120            .collect();
121        assert!(names.contains(&"TechCorp".to_string()));
122        assert!(names.contains(&"Alice".to_string()));
123        assert!(names.contains(&"London".to_string()));
124    }
125
126    #[test]
127    fn test_deduplicate_preserves_unique_edges() {
128        let source_id = Uuid::new_v4();
129        let target_id1 = Uuid::new_v4();
130        let target_id2 = Uuid::new_v4();
131
132        let edge1 = create_test_edge(source_id, target_id1, "works_at");
133        let edge2 = create_test_edge(source_id, target_id2, "located_in");
134
135        let result = deduplicate_nodes_and_edges(vec![], vec![edge1, edge2]);
136
137        assert_eq!(result.unique_edges.len(), 2);
138    }
139
140    #[test]
141    fn test_deduplicate_different_relationships_same_entities() {
142        let source_id = Uuid::new_v4();
143        let target_id = Uuid::new_v4();
144
145        // Same source and target, different relationships
146        let edge1 = create_test_edge(source_id, target_id, "works_at");
147        let edge2 = create_test_edge(source_id, target_id, "founded");
148
149        let result = deduplicate_nodes_and_edges(vec![], vec![edge1, edge2]);
150
151        // Should keep both edges (different relationships)
152        assert_eq!(result.unique_edges.len(), 2);
153    }
154
155    #[test]
156    fn test_deduplicate_empty_input() {
157        let result = deduplicate_nodes_and_edges(vec![], vec![]);
158
159        assert_eq!(result.unique_nodes.len(), 0);
160        assert_eq!(result.unique_edges.len(), 0);
161    }
162
163    #[test]
164    fn test_deduplicate_later_entry_overwrites() {
165        let source_id = Uuid::new_v4();
166        let target_id = Uuid::new_v4();
167
168        // Create two edges with same key but different properties
169        let mut edge1 = create_test_edge(source_id, target_id, "works_at");
170        edge1.add_property("since", "2020");
171
172        let mut edge2 = create_test_edge(source_id, target_id, "works_at");
173        edge2.add_property("since", "2021");
174
175        let result = deduplicate_nodes_and_edges(vec![], vec![edge1, edge2]);
176
177        assert_eq!(result.unique_edges.len(), 1);
178        // Later entry (edge2) should win
179        assert_eq!(
180            result.unique_edges[0].properties.get("since"),
181            Some(&"2021".to_string())
182        );
183    }
184
185    #[test]
186    fn test_deduplicate_mixed_unique_and_duplicate() {
187        let node1 = create_test_node("TechCorp", "Organization");
188        let node2 = create_test_node("Alice", "Person");
189        let node3_dup = create_test_node("Alice", "Person");
190
191        // Force node3 to have same ID as node2
192        let mut node3 = node3_dup.clone();
193        node3.entity.base.id = node2.entity.base.id;
194
195        let result = deduplicate_nodes_and_edges(vec![node1, node2.clone(), node3], vec![]);
196
197        // Should have 2 unique nodes (TechCorp and Alice)
198        assert_eq!(result.unique_nodes.len(), 2);
199    }
200}