Skip to main content

sem_core/model/
identity.rs

1use std::collections::{HashMap, HashSet};
2
3use super::change::{ChangeType, SemanticChange};
4use super::entity::SemanticEntity;
5
6pub struct MatchResult {
7    pub changes: Vec<SemanticChange>,
8}
9
10/// 3-phase entity matching algorithm:
11/// 1. Exact ID match — same entity ID in before/after → modified or unchanged
12/// 2. Content hash match — same hash, different ID → renamed or moved
13/// 3. Fuzzy similarity — >80% content similarity → probable rename
14pub fn match_entities(
15    before: &[SemanticEntity],
16    after: &[SemanticEntity],
17    _file_path: &str,
18    similarity_fn: Option<&dyn Fn(&SemanticEntity, &SemanticEntity) -> f64>,
19    commit_sha: Option<&str>,
20    author: Option<&str>,
21) -> MatchResult {
22    let mut changes: Vec<SemanticChange> = Vec::new();
23    let mut matched_before: HashSet<&str> = HashSet::new();
24    let mut matched_after: HashSet<&str> = HashSet::new();
25
26    let before_by_id: HashMap<&str, &SemanticEntity> =
27        before.iter().map(|e| (e.id.as_str(), e)).collect();
28    let after_by_id: HashMap<&str, &SemanticEntity> =
29        after.iter().map(|e| (e.id.as_str(), e)).collect();
30
31    // Phase 1: Exact ID match
32    for (&id, after_entity) in &after_by_id {
33        if let Some(before_entity) = before_by_id.get(id) {
34            matched_before.insert(id);
35            matched_after.insert(id);
36
37            if before_entity.content_hash != after_entity.content_hash {
38                let structural_change = match (&before_entity.structural_hash, &after_entity.structural_hash) {
39                    (Some(before_sh), Some(after_sh)) => Some(before_sh != after_sh),
40                    _ => None,
41                };
42                changes.push(SemanticChange {
43                    id: format!("change::{id}"),
44                    entity_id: id.to_string(),
45                    change_type: ChangeType::Modified,
46                    entity_type: after_entity.entity_type.clone(),
47                    entity_name: after_entity.name.clone(),
48                    file_path: after_entity.file_path.clone(),
49                    old_file_path: None,
50                    before_content: Some(before_entity.content.clone()),
51                    after_content: Some(after_entity.content.clone()),
52                    commit_sha: commit_sha.map(String::from),
53                    author: author.map(String::from),
54                    timestamp: None,
55                    structural_change,
56                });
57            }
58        }
59    }
60
61    // Collect unmatched
62    let unmatched_before: Vec<&SemanticEntity> = before
63        .iter()
64        .filter(|e| !matched_before.contains(e.id.as_str()))
65        .collect();
66    let unmatched_after: Vec<&SemanticEntity> = after
67        .iter()
68        .filter(|e| !matched_after.contains(e.id.as_str()))
69        .collect();
70
71    // Phase 2: Content hash match (rename/move detection)
72    let mut before_by_hash: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
73    let mut before_by_structural: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
74    for entity in &unmatched_before {
75        before_by_hash
76            .entry(entity.content_hash.as_str())
77            .or_default()
78            .push(entity);
79        if let Some(ref sh) = entity.structural_hash {
80            before_by_structural
81                .entry(sh.as_str())
82                .or_default()
83                .push(entity);
84        }
85    }
86
87    for after_entity in &unmatched_after {
88        if matched_after.contains(after_entity.id.as_str()) {
89            continue;
90        }
91        // Try exact content_hash first
92        let found = before_by_hash
93            .get_mut(after_entity.content_hash.as_str())
94            .and_then(|c| c.pop());
95        // Fall back to structural_hash (formatting/comment changes don't matter)
96        let found = found.or_else(|| {
97            after_entity.structural_hash.as_ref().and_then(|sh| {
98                before_by_structural.get_mut(sh.as_str()).and_then(|c| {
99                    c.iter()
100                        .position(|e| !matched_before.contains(e.id.as_str()))
101                        .map(|i| c.remove(i))
102                })
103            })
104        });
105
106        if let Some(before_entity) = found {
107            matched_before.insert(&before_entity.id);
108            matched_after.insert(&after_entity.id);
109
110            let change_type = if before_entity.file_path != after_entity.file_path {
111                ChangeType::Moved
112            } else {
113                ChangeType::Renamed
114            };
115
116            let old_file_path = if before_entity.file_path != after_entity.file_path {
117                Some(before_entity.file_path.clone())
118            } else {
119                None
120            };
121
122            changes.push(SemanticChange {
123                id: format!("change::{}", after_entity.id),
124                entity_id: after_entity.id.clone(),
125                change_type,
126                entity_type: after_entity.entity_type.clone(),
127                entity_name: after_entity.name.clone(),
128                file_path: after_entity.file_path.clone(),
129                old_file_path,
130                before_content: Some(before_entity.content.clone()),
131                after_content: Some(after_entity.content.clone()),
132                commit_sha: commit_sha.map(String::from),
133                author: author.map(String::from),
134                timestamp: None,
135                structural_change: None,
136            });
137        }
138    }
139
140    // Phase 3: Fuzzy similarity (>80% threshold)
141    let still_unmatched_before: Vec<&SemanticEntity> = unmatched_before
142        .iter()
143        .filter(|e| !matched_before.contains(e.id.as_str()))
144        .copied()
145        .collect();
146    let still_unmatched_after: Vec<&SemanticEntity> = unmatched_after
147        .iter()
148        .filter(|e| !matched_after.contains(e.id.as_str()))
149        .copied()
150        .collect();
151
152    if let Some(sim_fn) = similarity_fn {
153        if !still_unmatched_before.is_empty() && !still_unmatched_after.is_empty() {
154            const THRESHOLD: f64 = 0.8;
155            // Size ratio filter: pairs with very different content lengths can't reach 0.8 Jaccard
156            const SIZE_RATIO_CUTOFF: f64 = 0.5;
157
158            // Pre-compute content lengths for O(1) size filtering
159            let before_lens: Vec<usize> = still_unmatched_before
160                .iter()
161                .map(|e| e.content.split_whitespace().count())
162                .collect();
163            let after_lens: Vec<usize> = still_unmatched_after
164                .iter()
165                .map(|e| e.content.split_whitespace().count())
166                .collect();
167
168            for (ai, after_entity) in still_unmatched_after.iter().enumerate() {
169                let mut best_match: Option<&SemanticEntity> = None;
170                let mut best_score: f64 = 0.0;
171                let a_len = after_lens[ai];
172
173                for (bi, before_entity) in still_unmatched_before.iter().enumerate() {
174                    if matched_before.contains(before_entity.id.as_str()) {
175                        continue;
176                    }
177                    if before_entity.entity_type != after_entity.entity_type {
178                        continue;
179                    }
180
181                    // Early exit: skip pairs where token count ratio is too different
182                    let b_len = before_lens[bi];
183                    let (min_l, max_l) = if a_len < b_len { (a_len, b_len) } else { (b_len, a_len) };
184                    if max_l > 0 && (min_l as f64 / max_l as f64) < SIZE_RATIO_CUTOFF {
185                        continue;
186                    }
187
188                    let score = sim_fn(before_entity, after_entity);
189                    if score > best_score && score >= THRESHOLD {
190                        best_score = score;
191                        best_match = Some(before_entity);
192                    }
193                }
194
195                if let Some(matched) = best_match {
196                    matched_before.insert(&matched.id);
197                    matched_after.insert(&after_entity.id);
198
199                    let change_type = if matched.file_path != after_entity.file_path {
200                        ChangeType::Moved
201                    } else {
202                        ChangeType::Renamed
203                    };
204
205                    let old_file_path = if matched.file_path != after_entity.file_path {
206                        Some(matched.file_path.clone())
207                    } else {
208                        None
209                    };
210
211                    changes.push(SemanticChange {
212                        id: format!("change::{}", after_entity.id),
213                        entity_id: after_entity.id.clone(),
214                        change_type,
215                        entity_type: after_entity.entity_type.clone(),
216                        entity_name: after_entity.name.clone(),
217                        file_path: after_entity.file_path.clone(),
218                        old_file_path,
219                        before_content: Some(matched.content.clone()),
220                        after_content: Some(after_entity.content.clone()),
221                        commit_sha: commit_sha.map(String::from),
222                        author: author.map(String::from),
223                        timestamp: None,
224                        structural_change: None,
225                    });
226                }
227            }
228        }
229    }
230
231    // Remaining unmatched before = deleted
232    for entity in before.iter().filter(|e| !matched_before.contains(e.id.as_str())) {
233        changes.push(SemanticChange {
234            id: format!("change::deleted::{}", entity.id),
235            entity_id: entity.id.clone(),
236            change_type: ChangeType::Deleted,
237            entity_type: entity.entity_type.clone(),
238            entity_name: entity.name.clone(),
239            file_path: entity.file_path.clone(),
240            old_file_path: None,
241            before_content: Some(entity.content.clone()),
242            after_content: None,
243            commit_sha: commit_sha.map(String::from),
244            author: author.map(String::from),
245            timestamp: None,
246            structural_change: None,
247        });
248    }
249
250    // Remaining unmatched after = added
251    for entity in after.iter().filter(|e| !matched_after.contains(e.id.as_str())) {
252        changes.push(SemanticChange {
253            id: format!("change::added::{}", entity.id),
254            entity_id: entity.id.clone(),
255            change_type: ChangeType::Added,
256            entity_type: entity.entity_type.clone(),
257            entity_name: entity.name.clone(),
258            file_path: entity.file_path.clone(),
259            old_file_path: None,
260            before_content: None,
261            after_content: Some(entity.content.clone()),
262            commit_sha: commit_sha.map(String::from),
263            author: author.map(String::from),
264            timestamp: None,
265            structural_change: None,
266        });
267    }
268
269    MatchResult { changes }
270}
271
272/// Default content similarity using Jaccard index on whitespace-split tokens
273pub fn default_similarity(a: &SemanticEntity, b: &SemanticEntity) -> f64 {
274    let tokens_a: Vec<&str> = a.content.split_whitespace().collect();
275    let tokens_b: Vec<&str> = b.content.split_whitespace().collect();
276
277    // Early rejection: if token counts differ too much, Jaccard can't reach 0.8
278    let (min_c, max_c) = if tokens_a.len() < tokens_b.len() {
279        (tokens_a.len(), tokens_b.len())
280    } else {
281        (tokens_b.len(), tokens_a.len())
282    };
283    if max_c > 0 && (min_c as f64 / max_c as f64) < 0.6 {
284        return 0.0;
285    }
286
287    let set_a: HashSet<&str> = tokens_a.into_iter().collect();
288    let set_b: HashSet<&str> = tokens_b.into_iter().collect();
289
290    let intersection_size = set_a.intersection(&set_b).count();
291    let union_size = set_a.union(&set_b).count();
292
293    if union_size == 0 {
294        return 0.0;
295    }
296
297    intersection_size as f64 / union_size as f64
298}
299
300#[cfg(test)]
301mod tests {
302    use super::*;
303    use crate::utils::hash::content_hash;
304
305    fn make_entity(id: &str, name: &str, content: &str, file_path: &str) -> SemanticEntity {
306        SemanticEntity {
307            id: id.to_string(),
308            file_path: file_path.to_string(),
309            entity_type: "function".to_string(),
310            name: name.to_string(),
311            parent_id: None,
312            content: content.to_string(),
313            content_hash: content_hash(content),
314            structural_hash: None,
315            start_line: 1,
316            end_line: 1,
317            metadata: None,
318        }
319    }
320
321    #[test]
322    fn test_exact_match_modified() {
323        let before = vec![make_entity("a::f::foo", "foo", "old content", "a.ts")];
324        let after = vec![make_entity("a::f::foo", "foo", "new content", "a.ts")];
325        let result = match_entities(&before, &after, "a.ts", None, None, None);
326        assert_eq!(result.changes.len(), 1);
327        assert_eq!(result.changes[0].change_type, ChangeType::Modified);
328    }
329
330    #[test]
331    fn test_exact_match_unchanged() {
332        let before = vec![make_entity("a::f::foo", "foo", "same", "a.ts")];
333        let after = vec![make_entity("a::f::foo", "foo", "same", "a.ts")];
334        let result = match_entities(&before, &after, "a.ts", None, None, None);
335        assert_eq!(result.changes.len(), 0);
336    }
337
338    #[test]
339    fn test_added_deleted() {
340        let before = vec![make_entity("a::f::old", "old", "content", "a.ts")];
341        let after = vec![make_entity("a::f::new", "new", "different", "a.ts")];
342        let result = match_entities(&before, &after, "a.ts", None, None, None);
343        assert_eq!(result.changes.len(), 2);
344        let types: Vec<ChangeType> = result.changes.iter().map(|c| c.change_type).collect();
345        assert!(types.contains(&ChangeType::Deleted));
346        assert!(types.contains(&ChangeType::Added));
347    }
348
349    #[test]
350    fn test_content_hash_rename() {
351        let before = vec![make_entity("a::f::old", "old", "same content", "a.ts")];
352        let after = vec![make_entity("a::f::new", "new", "same content", "a.ts")];
353        let result = match_entities(&before, &after, "a.ts", None, None, None);
354        assert_eq!(result.changes.len(), 1);
355        assert_eq!(result.changes[0].change_type, ChangeType::Renamed);
356    }
357
358    #[test]
359    fn test_default_similarity() {
360        let a = make_entity("a", "a", "the quick brown fox", "a.ts");
361        let b = make_entity("b", "b", "the quick brown dog", "a.ts");
362        let score = default_similarity(&a, &b);
363        assert!(score > 0.5);
364        assert!(score < 1.0);
365    }
366}