Skip to main content

sem_core/model/
identity.rs

1use std::collections::{HashMap, HashSet};
2
3use super::change::{ChangeType, SemanticChange};
4use super::entity::SemanticEntity;
5
6pub struct MatchResult {
7    pub changes: Vec<SemanticChange>,
8}
9
10/// 3-phase entity matching algorithm:
11/// 1. Exact ID match — same entity ID in before/after → modified or unchanged
12/// 2. Content hash match — same hash, different ID → renamed or moved
13/// 3. Fuzzy similarity — >80% content similarity → probable rename
14pub fn match_entities(
15    before: &[SemanticEntity],
16    after: &[SemanticEntity],
17    _file_path: &str,
18    similarity_fn: Option<&dyn Fn(&SemanticEntity, &SemanticEntity) -> f64>,
19    commit_sha: Option<&str>,
20    author: Option<&str>,
21) -> MatchResult {
22    let mut changes: Vec<SemanticChange> = Vec::new();
23    let mut matched_before: HashSet<&str> = HashSet::new();
24    let mut matched_after: HashSet<&str> = HashSet::new();
25
26    let before_by_id: HashMap<&str, &SemanticEntity> =
27        before.iter().map(|e| (e.id.as_str(), e)).collect();
28    let after_by_id: HashMap<&str, &SemanticEntity> =
29        after.iter().map(|e| (e.id.as_str(), e)).collect();
30
31    // Phase 1: Exact ID match
32    for (&id, after_entity) in &after_by_id {
33        if let Some(before_entity) = before_by_id.get(id) {
34            matched_before.insert(id);
35            matched_after.insert(id);
36
37            if before_entity.content_hash != after_entity.content_hash {
38                let structural_change = match (&before_entity.structural_hash, &after_entity.structural_hash) {
39                    (Some(before_sh), Some(after_sh)) => Some(before_sh != after_sh),
40                    _ => None,
41                };
42                changes.push(SemanticChange {
43                    id: format!("change::{id}"),
44                    entity_id: id.to_string(),
45                    change_type: ChangeType::Modified,
46                    entity_type: after_entity.entity_type.clone(),
47                    entity_name: after_entity.name.clone(),
48                    file_path: after_entity.file_path.clone(),
49                    old_entity_name: None,
50                    old_file_path: None,
51                    before_content: Some(before_entity.content.clone()),
52                    after_content: Some(after_entity.content.clone()),
53                    commit_sha: commit_sha.map(String::from),
54                    author: author.map(String::from),
55                    timestamp: None,
56                    structural_change,
57                });
58            }
59        }
60    }
61
62    // Collect unmatched
63    let unmatched_before: Vec<&SemanticEntity> = before
64        .iter()
65        .filter(|e| !matched_before.contains(e.id.as_str()))
66        .collect();
67    let unmatched_after: Vec<&SemanticEntity> = after
68        .iter()
69        .filter(|e| !matched_after.contains(e.id.as_str()))
70        .collect();
71
72    // Phase 2: Content hash match (rename/move detection)
73    let mut before_by_hash: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
74    let mut before_by_structural: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
75    for entity in &unmatched_before {
76        before_by_hash
77            .entry(entity.content_hash.as_str())
78            .or_default()
79            .push(entity);
80        if let Some(ref sh) = entity.structural_hash {
81            before_by_structural
82                .entry(sh.as_str())
83                .or_default()
84                .push(entity);
85        }
86    }
87
88    for after_entity in &unmatched_after {
89        if matched_after.contains(after_entity.id.as_str()) {
90            continue;
91        }
92        // Try exact content_hash first
93        let found = before_by_hash
94            .get_mut(after_entity.content_hash.as_str())
95            .and_then(|c| c.pop());
96        // Fall back to structural_hash (formatting/comment changes don't matter)
97        let found = found.or_else(|| {
98            after_entity.structural_hash.as_ref().and_then(|sh| {
99                before_by_structural.get_mut(sh.as_str()).and_then(|c| {
100                    c.iter()
101                        .position(|e| !matched_before.contains(e.id.as_str()))
102                        .map(|i| c.remove(i))
103                })
104            })
105        });
106
107        if let Some(before_entity) = found {
108            matched_before.insert(&before_entity.id);
109            matched_after.insert(&after_entity.id);
110
111            // If name and file are the same, only the parent qualifier in the ID changed
112            // (e.g. parent class was renamed). Skip — the entity itself is unchanged.
113            if before_entity.name == after_entity.name
114                && before_entity.file_path == after_entity.file_path
115                && before_entity.content_hash == after_entity.content_hash
116            {
117                continue;
118            }
119
120            let change_type = if before_entity.file_path != after_entity.file_path {
121                ChangeType::Moved
122            } else {
123                ChangeType::Renamed
124            };
125
126            let old_file_path = if before_entity.file_path != after_entity.file_path {
127                Some(before_entity.file_path.clone())
128            } else {
129                None
130            };
131
132            let old_entity_name = if before_entity.name != after_entity.name {
133                Some(before_entity.name.clone())
134            } else {
135                None
136            };
137
138            changes.push(SemanticChange {
139                id: format!("change::{}", after_entity.id),
140                entity_id: after_entity.id.clone(),
141                change_type,
142                entity_type: after_entity.entity_type.clone(),
143                entity_name: after_entity.name.clone(),
144                file_path: after_entity.file_path.clone(),
145                old_entity_name,
146                old_file_path,
147                before_content: Some(before_entity.content.clone()),
148                after_content: Some(after_entity.content.clone()),
149                commit_sha: commit_sha.map(String::from),
150                author: author.map(String::from),
151                timestamp: None,
152                structural_change: None,
153            });
154        }
155    }
156
157    // Phase 3: Fuzzy similarity (>80% threshold)
158    let still_unmatched_before: Vec<&SemanticEntity> = unmatched_before
159        .iter()
160        .filter(|e| !matched_before.contains(e.id.as_str()))
161        .copied()
162        .collect();
163    let still_unmatched_after: Vec<&SemanticEntity> = unmatched_after
164        .iter()
165        .filter(|e| !matched_after.contains(e.id.as_str()))
166        .copied()
167        .collect();
168
169    if let Some(sim_fn) = similarity_fn {
170        if !still_unmatched_before.is_empty() && !still_unmatched_after.is_empty() {
171            const THRESHOLD: f64 = 0.8;
172            // Size ratio filter: pairs with very different content lengths can't reach 0.8 Jaccard
173            const SIZE_RATIO_CUTOFF: f64 = 0.5;
174
175            // Pre-compute content lengths for O(1) size filtering
176            let before_lens: Vec<usize> = still_unmatched_before
177                .iter()
178                .map(|e| e.content.split_whitespace().count())
179                .collect();
180            let after_lens: Vec<usize> = still_unmatched_after
181                .iter()
182                .map(|e| e.content.split_whitespace().count())
183                .collect();
184
185            for (ai, after_entity) in still_unmatched_after.iter().enumerate() {
186                let mut best_match: Option<&SemanticEntity> = None;
187                let mut best_score: f64 = 0.0;
188                let a_len = after_lens[ai];
189
190                for (bi, before_entity) in still_unmatched_before.iter().enumerate() {
191                    if matched_before.contains(before_entity.id.as_str()) {
192                        continue;
193                    }
194                    if before_entity.entity_type != after_entity.entity_type {
195                        continue;
196                    }
197
198                    // Early exit: skip pairs where token count ratio is too different
199                    let b_len = before_lens[bi];
200                    let (min_l, max_l) = if a_len < b_len { (a_len, b_len) } else { (b_len, a_len) };
201                    if max_l > 0 && (min_l as f64 / max_l as f64) < SIZE_RATIO_CUTOFF {
202                        continue;
203                    }
204
205                    let score = sim_fn(before_entity, after_entity);
206                    if score > best_score && score >= THRESHOLD {
207                        best_score = score;
208                        best_match = Some(before_entity);
209                    }
210                }
211
212                if let Some(matched) = best_match {
213                    matched_before.insert(&matched.id);
214                    matched_after.insert(&after_entity.id);
215
216                    // If name and file are the same, only the parent qualifier changed.
217                    if matched.name == after_entity.name
218                        && matched.file_path == after_entity.file_path
219                        && matched.content_hash == after_entity.content_hash
220                    {
221                        continue;
222                    }
223
224                    let change_type = if matched.file_path != after_entity.file_path {
225                        ChangeType::Moved
226                    } else {
227                        ChangeType::Renamed
228                    };
229
230                    let old_file_path = if matched.file_path != after_entity.file_path {
231                        Some(matched.file_path.clone())
232                    } else {
233                        None
234                    };
235
236                    let old_entity_name = if matched.name != after_entity.name {
237                        Some(matched.name.clone())
238                    } else {
239                        None
240                    };
241
242                    changes.push(SemanticChange {
243                        id: format!("change::{}", after_entity.id),
244                        entity_id: after_entity.id.clone(),
245                        change_type,
246                        entity_type: after_entity.entity_type.clone(),
247                        entity_name: after_entity.name.clone(),
248                        file_path: after_entity.file_path.clone(),
249                        old_entity_name,
250                        old_file_path,
251                        before_content: Some(matched.content.clone()),
252                        after_content: Some(after_entity.content.clone()),
253                        commit_sha: commit_sha.map(String::from),
254                        author: author.map(String::from),
255                        timestamp: None,
256                        structural_change: None,
257                    });
258                }
259            }
260        }
261    }
262
263    // Remaining unmatched before = deleted
264    for entity in before.iter().filter(|e| !matched_before.contains(e.id.as_str())) {
265        changes.push(SemanticChange {
266            id: format!("change::deleted::{}", entity.id),
267            entity_id: entity.id.clone(),
268            change_type: ChangeType::Deleted,
269            entity_type: entity.entity_type.clone(),
270            entity_name: entity.name.clone(),
271            file_path: entity.file_path.clone(),
272            old_entity_name: None,
273            old_file_path: None,
274            before_content: Some(entity.content.clone()),
275            after_content: None,
276            commit_sha: commit_sha.map(String::from),
277            author: author.map(String::from),
278            timestamp: None,
279            structural_change: None,
280        });
281    }
282
283    // Remaining unmatched after = added
284    for entity in after.iter().filter(|e| !matched_after.contains(e.id.as_str())) {
285        changes.push(SemanticChange {
286            id: format!("change::added::{}", entity.id),
287            entity_id: entity.id.clone(),
288            change_type: ChangeType::Added,
289            entity_type: entity.entity_type.clone(),
290            entity_name: entity.name.clone(),
291            file_path: entity.file_path.clone(),
292            old_entity_name: None,
293            old_file_path: None,
294            before_content: None,
295            after_content: Some(entity.content.clone()),
296            commit_sha: commit_sha.map(String::from),
297            author: author.map(String::from),
298            timestamp: None,
299            structural_change: None,
300        });
301    }
302
303    MatchResult { changes }
304}
305
306/// Default content similarity using Jaccard index on whitespace-split tokens
307pub fn default_similarity(a: &SemanticEntity, b: &SemanticEntity) -> f64 {
308    let tokens_a: Vec<&str> = a.content.split_whitespace().collect();
309    let tokens_b: Vec<&str> = b.content.split_whitespace().collect();
310
311    // Early rejection: if token counts differ too much, Jaccard can't reach 0.8
312    let (min_c, max_c) = if tokens_a.len() < tokens_b.len() {
313        (tokens_a.len(), tokens_b.len())
314    } else {
315        (tokens_b.len(), tokens_a.len())
316    };
317    if max_c > 0 && (min_c as f64 / max_c as f64) < 0.6 {
318        return 0.0;
319    }
320
321    let set_a: HashSet<&str> = tokens_a.into_iter().collect();
322    let set_b: HashSet<&str> = tokens_b.into_iter().collect();
323
324    let intersection_size = set_a.intersection(&set_b).count();
325    let union_size = set_a.union(&set_b).count();
326
327    if union_size == 0 {
328        return 0.0;
329    }
330
331    intersection_size as f64 / union_size as f64
332}
333
#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::hash::content_hash;

    /// Builds a top-level `function` entity whose content hash is derived
    /// from `content` and which carries no structural hash.
    fn make_entity(id: &str, name: &str, content: &str, file_path: &str) -> SemanticEntity {
        SemanticEntity {
            id: id.to_owned(),
            file_path: file_path.to_owned(),
            entity_type: String::from("function"),
            name: name.to_owned(),
            parent_id: None,
            content: content.to_owned(),
            content_hash: content_hash(content),
            structural_hash: None,
            start_line: 1,
            end_line: 1,
            metadata: None,
        }
    }

    /// Runs the matcher for `a.ts` without a similarity function or commit
    /// metadata and returns the detected changes.
    fn diff(before: &[SemanticEntity], after: &[SemanticEntity]) -> Vec<SemanticChange> {
        match_entities(before, after, "a.ts", None, None, None).changes
    }

    /// Same ID, different content → a single `Modified` change.
    #[test]
    fn test_exact_match_modified() {
        let changes = diff(
            &[make_entity("a::f::foo", "foo", "old content", "a.ts")],
            &[make_entity("a::f::foo", "foo", "new content", "a.ts")],
        );
        assert_eq!(changes.len(), 1);
        assert_eq!(changes[0].change_type, ChangeType::Modified);
    }

    /// Same ID, same content → nothing is reported.
    #[test]
    fn test_exact_match_unchanged() {
        let changes = diff(
            &[make_entity("a::f::foo", "foo", "same", "a.ts")],
            &[make_entity("a::f::foo", "foo", "same", "a.ts")],
        );
        assert_eq!(changes.len(), 0);
    }

    /// Unrelated entities → one `Deleted` plus one `Added`.
    #[test]
    fn test_added_deleted() {
        let changes = diff(
            &[make_entity("a::f::old", "old", "content", "a.ts")],
            &[make_entity("a::f::new", "new", "different", "a.ts")],
        );
        assert_eq!(changes.len(), 2);
        assert!(changes.iter().any(|c| c.change_type == ChangeType::Deleted));
        assert!(changes.iter().any(|c| c.change_type == ChangeType::Added));
    }

    /// Different ID, identical content in the same file → `Renamed`.
    #[test]
    fn test_content_hash_rename() {
        let changes = diff(
            &[make_entity("a::f::old", "old", "same content", "a.ts")],
            &[make_entity("a::f::new", "new", "same content", "a.ts")],
        );
        assert_eq!(changes.len(), 1);
        assert_eq!(changes[0].change_type, ChangeType::Renamed);
    }

    /// Contents sharing most tokens score strictly between 0.5 and 1.0.
    #[test]
    fn test_default_similarity() {
        let fox = make_entity("a", "a", "the quick brown fox", "a.ts");
        let dog = make_entity("b", "b", "the quick brown dog", "a.ts");
        let score = default_similarity(&fox, &dog);
        assert!(score > 0.5);
        assert!(score < 1.0);
    }
}