Skip to main content

sem_core/model/
identity.rs

1use std::collections::{HashMap, HashSet};
2
3use super::change::{ChangeType, SemanticChange};
4use super::entity::SemanticEntity;
5
/// Outcome of [`match_entities`]: the changes detected between the `before`
/// and `after` entity snapshots.
pub struct MatchResult {
    /// Detected changes, in emission order: modifications of entities whose ID
    /// is present on both sides, then renames/moves, then deletions, then
    /// additions.
    pub changes: Vec<SemanticChange>,
}
9
10/// 3-phase entity matching algorithm:
11/// 1. Exact ID match — same entity ID in before/after → modified or unchanged
12/// 2. Content hash match — same hash, different ID → renamed or moved
13/// 3. Fuzzy similarity — >80% content similarity → probable rename
14pub fn match_entities(
15    before: &[SemanticEntity],
16    after: &[SemanticEntity],
17    _file_path: &str,
18    _similarity_fn: Option<&dyn Fn(&SemanticEntity, &SemanticEntity) -> f64>,
19    commit_sha: Option<&str>,
20    author: Option<&str>,
21) -> MatchResult {
22    let mut changes: Vec<SemanticChange> = Vec::new();
23    let mut matched_before: HashSet<&str> = HashSet::new();
24    let mut matched_after: HashSet<&str> = HashSet::new();
25
26    let before_by_id: HashMap<&str, &SemanticEntity> =
27        before.iter().map(|e| (e.id.as_str(), e)).collect();
28    let after_by_id: HashMap<&str, &SemanticEntity> =
29        after.iter().map(|e| (e.id.as_str(), e)).collect();
30
31    // Phase 1: Exact ID match
32    for (&id, after_entity) in &after_by_id {
33        if let Some(before_entity) = before_by_id.get(id) {
34            matched_before.insert(id);
35            matched_after.insert(id);
36
37            if before_entity.content_hash != after_entity.content_hash {
38                let structural_change = match (&before_entity.structural_hash, &after_entity.structural_hash) {
39                    (Some(before_sh), Some(after_sh)) => Some(before_sh != after_sh),
40                    _ => None,
41                };
42                changes.push(SemanticChange {
43                    id: format!("change::{id}"),
44                    entity_id: id.to_string(),
45                    change_type: ChangeType::Modified,
46                    entity_type: after_entity.entity_type.clone(),
47                    entity_name: after_entity.name.clone(),
48                    entity_line: after_entity.start_line,
49                    file_path: after_entity.file_path.clone(),
50                    old_entity_name: None,
51                    old_file_path: None,
52                    before_content: Some(before_entity.content.clone()),
53                    after_content: Some(after_entity.content.clone()),
54                    commit_sha: commit_sha.map(String::from),
55                    author: author.map(String::from),
56                    timestamp: None,
57                    structural_change,
58                });
59            }
60        }
61    }
62
63    // Collect unmatched
64    let unmatched_before: Vec<&SemanticEntity> = before
65        .iter()
66        .filter(|e| !matched_before.contains(e.id.as_str()))
67        .collect();
68    let unmatched_after: Vec<&SemanticEntity> = after
69        .iter()
70        .filter(|e| !matched_after.contains(e.id.as_str()))
71        .collect();
72
73    // Phase 2: Content hash match (rename/move detection)
74    let mut before_by_hash: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
75    let mut before_by_structural: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
76    for entity in &unmatched_before {
77        before_by_hash
78            .entry(entity.content_hash.as_str())
79            .or_default()
80            .push(entity);
81        if let Some(ref sh) = entity.structural_hash {
82            before_by_structural
83                .entry(sh.as_str())
84                .or_default()
85                .push(entity);
86        }
87    }
88
89    for after_entity in &unmatched_after {
90        if matched_after.contains(after_entity.id.as_str()) {
91            continue;
92        }
93        // Try exact content_hash first
94        let found = before_by_hash
95            .get_mut(after_entity.content_hash.as_str())
96            .and_then(|c| c.pop());
97        // Fall back to structural_hash (formatting/comment changes don't matter)
98        let found = found.or_else(|| {
99            after_entity.structural_hash.as_ref().and_then(|sh| {
100                before_by_structural.get_mut(sh.as_str()).and_then(|c| {
101                    c.iter()
102                        .position(|e| !matched_before.contains(e.id.as_str()))
103                        .map(|i| c.remove(i))
104                })
105            })
106        });
107
108        if let Some(before_entity) = found {
109            matched_before.insert(&before_entity.id);
110            matched_after.insert(&after_entity.id);
111
112            // If name and file are the same, only the parent qualifier in the ID changed
113            // (e.g. parent class was renamed). Skip — the entity itself is unchanged.
114            if before_entity.name == after_entity.name
115                && before_entity.file_path == after_entity.file_path
116                && before_entity.content_hash == after_entity.content_hash
117            {
118                continue;
119            }
120
121            let change_type = if before_entity.file_path != after_entity.file_path {
122                ChangeType::Moved
123            } else {
124                ChangeType::Renamed
125            };
126
127            let old_file_path = if before_entity.file_path != after_entity.file_path {
128                Some(before_entity.file_path.clone())
129            } else {
130                None
131            };
132
133            let old_entity_name = if before_entity.name != after_entity.name {
134                Some(before_entity.name.clone())
135            } else {
136                None
137            };
138
139            changes.push(SemanticChange {
140                id: format!("change::{}", after_entity.id),
141                entity_id: after_entity.id.clone(),
142                change_type,
143                entity_type: after_entity.entity_type.clone(),
144                entity_name: after_entity.name.clone(),
145                entity_line: after_entity.start_line,
146                file_path: after_entity.file_path.clone(),
147                old_entity_name,
148                old_file_path,
149                before_content: Some(before_entity.content.clone()),
150                after_content: Some(after_entity.content.clone()),
151                commit_sha: commit_sha.map(String::from),
152                author: author.map(String::from),
153                timestamp: None,
154                structural_change: None,
155            });
156        }
157    }
158
159    // Phase 3: Fuzzy similarity (>80% threshold)
160    // Optimized: pre-compute token sets once per entity, group by type
161    let still_unmatched_before: Vec<&SemanticEntity> = unmatched_before
162        .iter()
163        .filter(|e| !matched_before.contains(e.id.as_str()))
164        .copied()
165        .collect();
166    let still_unmatched_after: Vec<&SemanticEntity> = unmatched_after
167        .iter()
168        .filter(|e| !matched_after.contains(e.id.as_str()))
169        .copied()
170        .collect();
171
172    if !still_unmatched_before.is_empty() && !still_unmatched_after.is_empty() {
173        const THRESHOLD: f64 = 0.8;
174        const SIZE_RATIO_CUTOFF: f64 = 0.5;
175
176        // Pre-compute token sets once per entity (N+M instead of N×M allocations)
177        let before_sets: Vec<HashSet<&str>> = still_unmatched_before
178            .iter()
179            .map(|e| e.content.split_whitespace().collect())
180            .collect();
181        let after_sets: Vec<HashSet<&str>> = still_unmatched_after
182            .iter()
183            .map(|e| e.content.split_whitespace().collect())
184            .collect();
185
186        // Group before entities by type: O(sum(n_t × m_t)) instead of O(N×M)
187        let mut before_by_type: HashMap<&str, Vec<usize>> = HashMap::new();
188        for (i, e) in still_unmatched_before.iter().enumerate() {
189            before_by_type
190                .entry(e.entity_type.as_str())
191                .or_default()
192                .push(i);
193        }
194
195        for (ai, after_entity) in still_unmatched_after.iter().enumerate() {
196            let candidates = match before_by_type.get(after_entity.entity_type.as_str()) {
197                Some(indices) => indices,
198                None => continue,
199            };
200
201            let a_set = &after_sets[ai];
202            let a_len = a_set.len();
203            let mut best_idx: Option<usize> = None;
204            let mut best_score: f64 = 0.0;
205
206            for &bi in candidates {
207                if matched_before.contains(still_unmatched_before[bi].id.as_str()) {
208                    continue;
209                }
210
211                let b_set = &before_sets[bi];
212                let b_len = b_set.len();
213
214                // Size ratio filter using pre-computed set lengths
215                let (min_l, max_l) = if a_len < b_len {
216                    (a_len, b_len)
217                } else {
218                    (b_len, a_len)
219                };
220                if max_l > 0 && (min_l as f64 / max_l as f64) < SIZE_RATIO_CUTOFF {
221                    continue;
222                }
223
224                // Inline Jaccard on pre-computed sets
225                let intersection = a_set.intersection(b_set).count();
226                let union = a_len + b_len - intersection;
227                let score = if union == 0 {
228                    0.0
229                } else {
230                    intersection as f64 / union as f64
231                };
232
233                if score >= THRESHOLD && score > best_score {
234                    best_score = score;
235                    best_idx = Some(bi);
236                }
237            }
238
239            if let Some(bi) = best_idx {
240                let matched = still_unmatched_before[bi];
241                matched_before.insert(&matched.id);
242                matched_after.insert(&after_entity.id);
243
244                // If name and file are the same, only the parent qualifier changed.
245                if matched.name == after_entity.name
246                    && matched.file_path == after_entity.file_path
247                    && matched.content_hash == after_entity.content_hash
248                {
249                    continue;
250                }
251
252                let change_type = if matched.file_path != after_entity.file_path {
253                    ChangeType::Moved
254                } else {
255                    ChangeType::Renamed
256                };
257
258                let old_file_path = if matched.file_path != after_entity.file_path {
259                    Some(matched.file_path.clone())
260                } else {
261                    None
262                };
263
264                let old_entity_name = if matched.name != after_entity.name {
265                    Some(matched.name.clone())
266                } else {
267                    None
268                };
269
270                changes.push(SemanticChange {
271                    id: format!("change::{}", after_entity.id),
272                    entity_id: after_entity.id.clone(),
273                    change_type,
274                    entity_type: after_entity.entity_type.clone(),
275                    entity_name: after_entity.name.clone(),
276                    entity_line: after_entity.start_line,
277                    file_path: after_entity.file_path.clone(),
278                    old_entity_name,
279                    old_file_path,
280                    before_content: Some(matched.content.clone()),
281                    after_content: Some(after_entity.content.clone()),
282                    commit_sha: commit_sha.map(String::from),
283                    author: author.map(String::from),
284                    timestamp: None,
285                    structural_change: None,
286                });
287            }
288        }
289    }
290
291    // Remaining unmatched before = deleted
292    for entity in before.iter().filter(|e| !matched_before.contains(e.id.as_str())) {
293        changes.push(SemanticChange {
294            id: format!("change::deleted::{}", entity.id),
295            entity_id: entity.id.clone(),
296            change_type: ChangeType::Deleted,
297            entity_type: entity.entity_type.clone(),
298            entity_name: entity.name.clone(),
299            entity_line: entity.start_line,
300            file_path: entity.file_path.clone(),
301            old_entity_name: None,
302            old_file_path: None,
303            before_content: Some(entity.content.clone()),
304            after_content: None,
305            commit_sha: commit_sha.map(String::from),
306            author: author.map(String::from),
307            timestamp: None,
308            structural_change: None,
309        });
310    }
311
312    // Remaining unmatched after = added
313    for entity in after.iter().filter(|e| !matched_after.contains(e.id.as_str())) {
314        changes.push(SemanticChange {
315            id: format!("change::added::{}", entity.id),
316            entity_id: entity.id.clone(),
317            change_type: ChangeType::Added,
318            entity_type: entity.entity_type.clone(),
319            entity_name: entity.name.clone(),
320            entity_line: entity.start_line,
321            file_path: entity.file_path.clone(),
322            old_entity_name: None,
323            old_file_path: None,
324            before_content: None,
325            after_content: Some(entity.content.clone()),
326            commit_sha: commit_sha.map(String::from),
327            author: author.map(String::from),
328            timestamp: None,
329            structural_change: None,
330        });
331    }
332
333    MatchResult { changes }
334}
335
336/// Default content similarity using Jaccard index on whitespace-split tokens
337pub fn default_similarity(a: &SemanticEntity, b: &SemanticEntity) -> f64 {
338    let tokens_a: Vec<&str> = a.content.split_whitespace().collect();
339    let tokens_b: Vec<&str> = b.content.split_whitespace().collect();
340
341    // Early rejection: if token counts differ too much, Jaccard can't reach 0.8
342    let (min_c, max_c) = if tokens_a.len() < tokens_b.len() {
343        (tokens_a.len(), tokens_b.len())
344    } else {
345        (tokens_b.len(), tokens_a.len())
346    };
347    if max_c > 0 && (min_c as f64 / max_c as f64) < 0.6 {
348        return 0.0;
349    }
350
351    let set_a: HashSet<&str> = tokens_a.into_iter().collect();
352    let set_b: HashSet<&str> = tokens_b.into_iter().collect();
353
354    let intersection_size = set_a.intersection(&set_b).count();
355    let union_size = set_a.union(&set_b).count();
356
357    if union_size == 0 {
358        return 0.0;
359    }
360
361    intersection_size as f64 / union_size as f64
362}
363
#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::hash::content_hash;

    /// Builds a single-line `function` entity; the content hash is derived
    /// from `content` and no structural hash is set.
    fn make_entity(id: &str, name: &str, content: &str, file_path: &str) -> SemanticEntity {
        let hash = content_hash(content);
        SemanticEntity {
            id: id.to_owned(),
            name: name.to_owned(),
            entity_type: "function".to_owned(),
            file_path: file_path.to_owned(),
            parent_id: None,
            content: content.to_owned(),
            content_hash: hash,
            structural_hash: None,
            start_line: 1,
            end_line: 1,
            metadata: None,
        }
    }

    /// Runs the matcher for `a.ts` without commit metadata and returns the changes.
    fn diff(before: &[SemanticEntity], after: &[SemanticEntity]) -> Vec<SemanticChange> {
        match_entities(before, after, "a.ts", None, None, None).changes
    }

    #[test]
    fn test_exact_match_modified() {
        // Same ID, different content → one Modified change.
        let old = [make_entity("a::f::foo", "foo", "old content", "a.ts")];
        let new = [make_entity("a::f::foo", "foo", "new content", "a.ts")];

        let changes = diff(&old, &new);

        assert_eq!(changes.len(), 1);
        assert_eq!(changes[0].change_type, ChangeType::Modified);
    }

    #[test]
    fn test_exact_match_unchanged() {
        // Same ID, same content → nothing to report.
        let old = [make_entity("a::f::foo", "foo", "same", "a.ts")];
        let new = [make_entity("a::f::foo", "foo", "same", "a.ts")];

        let changes = diff(&old, &new);

        assert_eq!(changes.len(), 0);
    }

    #[test]
    fn test_added_deleted() {
        // Unrelated ID and content → one deletion plus one addition.
        let old = [make_entity("a::f::old", "old", "content", "a.ts")];
        let new = [make_entity("a::f::new", "new", "different", "a.ts")];

        let changes = diff(&old, &new);

        assert_eq!(changes.len(), 2);
        let has = |wanted: ChangeType| changes.iter().any(|c| c.change_type == wanted);
        assert!(has(ChangeType::Deleted));
        assert!(has(ChangeType::Added));
    }

    #[test]
    fn test_content_hash_rename() {
        // Different ID, identical content in the same file → Renamed.
        let old = [make_entity("a::f::old", "old", "same content", "a.ts")];
        let new = [make_entity("a::f::new", "new", "same content", "a.ts")];

        let changes = diff(&old, &new);

        assert_eq!(changes.len(), 1);
        assert_eq!(changes[0].change_type, ChangeType::Renamed);
    }

    #[test]
    fn test_default_similarity() {
        // Three of five distinct tokens are shared: similar but not identical.
        let fox = make_entity("a", "a", "the quick brown fox", "a.ts");
        let dog = make_entity("b", "b", "the quick brown dog", "a.ts");

        let score = default_similarity(&fox, &dog);

        assert!(score > 0.5);
        assert!(score < 1.0);
    }
}