Skip to main content

sem_core/model/
identity.rs

1use std::collections::{HashMap, HashSet};
2
3use super::change::{ChangeType, SemanticChange};
4use super::entity::SemanticEntity;
5
/// Value returned by [`match_entities`].
pub struct MatchResult {
    /// Every change detected between the `before` and `after` entity lists.
    pub changes: Vec<SemanticChange>,
}
9
10/// 3-phase entity matching algorithm:
11/// 1. Exact ID match — same entity ID in before/after → modified or unchanged
12/// 2. Content hash match — same hash, different ID → renamed or moved
13/// 3. Fuzzy similarity — >80% content similarity → probable rename
14pub fn match_entities(
15    before: &[SemanticEntity],
16    after: &[SemanticEntity],
17    _file_path: &str,
18    _similarity_fn: Option<&dyn Fn(&SemanticEntity, &SemanticEntity) -> f64>,
19    commit_sha: Option<&str>,
20    author: Option<&str>,
21) -> MatchResult {
22    let mut changes: Vec<SemanticChange> = Vec::new();
23    let mut matched_before: HashSet<&str> = HashSet::new();
24    let mut matched_after: HashSet<&str> = HashSet::new();
25
26    let before_by_id: HashMap<&str, &SemanticEntity> =
27        before.iter().map(|e| (e.id.as_str(), e)).collect();
28    let after_by_id: HashMap<&str, &SemanticEntity> =
29        after.iter().map(|e| (e.id.as_str(), e)).collect();
30
31    // Phase 1: Exact ID match
32    for (&id, after_entity) in &after_by_id {
33        if let Some(before_entity) = before_by_id.get(id) {
34            matched_before.insert(id);
35            matched_after.insert(id);
36
37            if before_entity.content_hash != after_entity.content_hash {
38                let structural_change = match (&before_entity.structural_hash, &after_entity.structural_hash) {
39                    (Some(before_sh), Some(after_sh)) => Some(before_sh != after_sh),
40                    _ => None,
41                };
42                changes.push(SemanticChange {
43                    id: format!("change::{id}"),
44                    entity_id: id.to_string(),
45                    change_type: ChangeType::Modified,
46                    entity_type: after_entity.entity_type.clone(),
47                    entity_name: after_entity.name.clone(),
48                    entity_line: after_entity.start_line,
49                    file_path: after_entity.file_path.clone(),
50                    old_entity_name: None,
51                    old_file_path: None,
52                    before_content: Some(before_entity.content.clone()),
53                    after_content: Some(after_entity.content.clone()),
54                    commit_sha: commit_sha.map(String::from),
55                    author: author.map(String::from),
56                    timestamp: None,
57                    structural_change,
58                });
59            }
60        }
61    }
62
63    // Collect unmatched
64    let unmatched_before: Vec<&SemanticEntity> = before
65        .iter()
66        .filter(|e| !matched_before.contains(e.id.as_str()))
67        .collect();
68    let unmatched_after: Vec<&SemanticEntity> = after
69        .iter()
70        .filter(|e| !matched_after.contains(e.id.as_str()))
71        .collect();
72
73    // Phase 2: Content hash match (rename/move detection)
74    let mut before_by_hash: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
75    let mut before_by_structural: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
76    for entity in &unmatched_before {
77        before_by_hash
78            .entry(entity.content_hash.as_str())
79            .or_default()
80            .push(entity);
81        if let Some(ref sh) = entity.structural_hash {
82            before_by_structural
83                .entry(sh.as_str())
84                .or_default()
85                .push(entity);
86        }
87    }
88
89    for after_entity in &unmatched_after {
90        if matched_after.contains(after_entity.id.as_str()) {
91            continue;
92        }
93        // Try exact content_hash first
94        let found = before_by_hash
95            .get_mut(after_entity.content_hash.as_str())
96            .and_then(|c| c.pop());
97        // Fall back to structural_hash (formatting/comment changes don't matter)
98        let found = found.or_else(|| {
99            after_entity.structural_hash.as_ref().and_then(|sh| {
100                before_by_structural.get_mut(sh.as_str()).and_then(|c| {
101                    c.iter()
102                        .position(|e| !matched_before.contains(e.id.as_str()))
103                        .map(|i| c.remove(i))
104                })
105            })
106        });
107
108        if let Some(before_entity) = found {
109            matched_before.insert(&before_entity.id);
110            matched_after.insert(&after_entity.id);
111
112            // If name and file are the same, only the parent qualifier in the ID changed
113            // (e.g. parent class was renamed). Skip — the entity itself is unchanged.
114            if before_entity.name == after_entity.name
115                && before_entity.file_path == after_entity.file_path
116                && before_entity.content_hash == after_entity.content_hash
117            {
118                continue;
119            }
120
121            let change_type = if before_entity.file_path != after_entity.file_path {
122                ChangeType::Moved
123            } else {
124                ChangeType::Renamed
125            };
126
127            let old_file_path = if before_entity.file_path != after_entity.file_path {
128                Some(before_entity.file_path.clone())
129            } else {
130                None
131            };
132
133            let old_entity_name = if before_entity.name != after_entity.name {
134                Some(before_entity.name.clone())
135            } else {
136                None
137            };
138
139            changes.push(SemanticChange {
140                id: format!("change::{}", after_entity.id),
141                entity_id: after_entity.id.clone(),
142                change_type,
143                entity_type: after_entity.entity_type.clone(),
144                entity_name: after_entity.name.clone(),
145                entity_line: after_entity.start_line,
146                file_path: after_entity.file_path.clone(),
147                old_entity_name,
148                old_file_path,
149                before_content: Some(before_entity.content.clone()),
150                after_content: Some(after_entity.content.clone()),
151                commit_sha: commit_sha.map(String::from),
152                author: author.map(String::from),
153                timestamp: None,
154                structural_change: None,
155            });
156        }
157    }
158
159    // Phase 3: Fuzzy similarity (>80% threshold)
160    // Optimized: pre-compute token sets once per entity, group by type
161    let still_unmatched_before: Vec<&SemanticEntity> = unmatched_before
162        .iter()
163        .filter(|e| !matched_before.contains(e.id.as_str()))
164        .copied()
165        .collect();
166    let still_unmatched_after: Vec<&SemanticEntity> = unmatched_after
167        .iter()
168        .filter(|e| !matched_after.contains(e.id.as_str()))
169        .copied()
170        .collect();
171
172    if !still_unmatched_before.is_empty() && !still_unmatched_after.is_empty() {
173        const THRESHOLD: f64 = 0.8;
174        const SIZE_RATIO_CUTOFF: f64 = 0.5;
175
176        // Pre-compute token sets once per entity (N+M instead of N×M allocations)
177        let before_sets: Vec<HashSet<&str>> = still_unmatched_before
178            .iter()
179            .map(|e| e.content.split_whitespace().collect())
180            .collect();
181        let after_sets: Vec<HashSet<&str>> = still_unmatched_after
182            .iter()
183            .map(|e| e.content.split_whitespace().collect())
184            .collect();
185
186        // Group before entities by type: O(sum(n_t × m_t)) instead of O(N×M)
187        let mut before_by_type: HashMap<&str, Vec<usize>> = HashMap::new();
188        for (i, e) in still_unmatched_before.iter().enumerate() {
189            before_by_type
190                .entry(e.entity_type.as_str())
191                .or_default()
192                .push(i);
193        }
194
195        for (ai, after_entity) in still_unmatched_after.iter().enumerate() {
196            let candidates = match before_by_type.get(after_entity.entity_type.as_str()) {
197                Some(indices) => indices,
198                None => continue,
199            };
200
201            let a_set = &after_sets[ai];
202            let a_len = a_set.len();
203            let mut best_idx: Option<usize> = None;
204            let mut best_score: f64 = 0.0;
205
206            for &bi in candidates {
207                if matched_before.contains(still_unmatched_before[bi].id.as_str()) {
208                    continue;
209                }
210
211                let b_set = &before_sets[bi];
212                let b_len = b_set.len();
213
214                // Size ratio filter using pre-computed set lengths
215                let (min_l, max_l) = if a_len < b_len {
216                    (a_len, b_len)
217                } else {
218                    (b_len, a_len)
219                };
220                if max_l > 0 && (min_l as f64 / max_l as f64) < SIZE_RATIO_CUTOFF {
221                    continue;
222                }
223
224                // Inline Jaccard on pre-computed sets
225                let intersection = a_set.intersection(b_set).count();
226                let union = a_len + b_len - intersection;
227                let score = if union == 0 {
228                    0.0
229                } else {
230                    intersection as f64 / union as f64
231                };
232
233                if score >= THRESHOLD && score > best_score {
234                    best_score = score;
235                    best_idx = Some(bi);
236                }
237            }
238
239            if let Some(bi) = best_idx {
240                let matched = still_unmatched_before[bi];
241                matched_before.insert(&matched.id);
242                matched_after.insert(&after_entity.id);
243
244                // If name and file are the same, only the parent qualifier changed.
245                if matched.name == after_entity.name
246                    && matched.file_path == after_entity.file_path
247                    && matched.content_hash == after_entity.content_hash
248                {
249                    continue;
250                }
251
252                let change_type = if matched.file_path != after_entity.file_path {
253                    ChangeType::Moved
254                } else {
255                    ChangeType::Renamed
256                };
257
258                let old_file_path = if matched.file_path != after_entity.file_path {
259                    Some(matched.file_path.clone())
260                } else {
261                    None
262                };
263
264                let old_entity_name = if matched.name != after_entity.name {
265                    Some(matched.name.clone())
266                } else {
267                    None
268                };
269
270                changes.push(SemanticChange {
271                    id: format!("change::{}", after_entity.id),
272                    entity_id: after_entity.id.clone(),
273                    change_type,
274                    entity_type: after_entity.entity_type.clone(),
275                    entity_name: after_entity.name.clone(),
276                    entity_line: after_entity.start_line,
277                    file_path: after_entity.file_path.clone(),
278                    old_entity_name,
279                    old_file_path,
280                    before_content: Some(matched.content.clone()),
281                    after_content: Some(after_entity.content.clone()),
282                    commit_sha: commit_sha.map(String::from),
283                    author: author.map(String::from),
284                    timestamp: None,
285                    structural_change: None,
286                });
287            }
288        }
289    }
290
291    // Remaining unmatched before = deleted
292    for entity in before.iter().filter(|e| !matched_before.contains(e.id.as_str())) {
293        changes.push(SemanticChange {
294            id: format!("change::deleted::{}", entity.id),
295            entity_id: entity.id.clone(),
296            change_type: ChangeType::Deleted,
297            entity_type: entity.entity_type.clone(),
298            entity_name: entity.name.clone(),
299            entity_line: entity.start_line,
300            file_path: entity.file_path.clone(),
301            old_entity_name: None,
302            old_file_path: None,
303            before_content: Some(entity.content.clone()),
304            after_content: None,
305            commit_sha: commit_sha.map(String::from),
306            author: author.map(String::from),
307            timestamp: None,
308            structural_change: None,
309        });
310    }
311
312    // Remaining unmatched after = added
313    for entity in after.iter().filter(|e| !matched_after.contains(e.id.as_str())) {
314        changes.push(SemanticChange {
315            id: format!("change::added::{}", entity.id),
316            entity_id: entity.id.clone(),
317            change_type: ChangeType::Added,
318            entity_type: entity.entity_type.clone(),
319            entity_name: entity.name.clone(),
320            entity_line: entity.start_line,
321            file_path: entity.file_path.clone(),
322            old_entity_name: None,
323            old_file_path: None,
324            before_content: None,
325            after_content: Some(entity.content.clone()),
326            commit_sha: commit_sha.map(String::from),
327            author: author.map(String::from),
328            timestamp: None,
329            structural_change: None,
330        });
331    }
332
333    // Deduplicate: when a parent (class) is Modified and one or more of its
334    // children (methods) are also Modified, drop the parent. The child diffs
335    // are more specific and the parent body overlaps with them.
336    // Only applies to Modified; Added/Deleted should still show all entities.
337    let modified_ids: HashSet<&str> = changes
338        .iter()
339        .filter(|c| c.change_type == ChangeType::Modified)
340        .map(|c| c.entity_id.as_str())
341        .collect();
342
343    if modified_ids.len() > 1 {
344        let mut parents_to_remove: HashSet<&str> = HashSet::new();
345        for entity in after.iter().chain(before.iter()) {
346            if let Some(ref pid) = entity.parent_id {
347                if modified_ids.contains(entity.id.as_str())
348                    && modified_ids.contains(pid.as_str())
349                {
350                    parents_to_remove.insert(pid.as_str());
351                }
352            }
353        }
354
355        if !parents_to_remove.is_empty() {
356            changes.retain(|c| {
357                !(c.change_type == ChangeType::Modified
358                    && parents_to_remove.contains(c.entity_id.as_str()))
359            });
360        }
361    }
362
363    MatchResult { changes }
364}
365
366/// Default content similarity using Jaccard index on whitespace-split tokens
367pub fn default_similarity(a: &SemanticEntity, b: &SemanticEntity) -> f64 {
368    let tokens_a: Vec<&str> = a.content.split_whitespace().collect();
369    let tokens_b: Vec<&str> = b.content.split_whitespace().collect();
370
371    // Early rejection: if token counts differ too much, Jaccard can't reach 0.8
372    let (min_c, max_c) = if tokens_a.len() < tokens_b.len() {
373        (tokens_a.len(), tokens_b.len())
374    } else {
375        (tokens_b.len(), tokens_a.len())
376    };
377    if max_c > 0 && (min_c as f64 / max_c as f64) < 0.6 {
378        return 0.0;
379    }
380
381    let set_a: HashSet<&str> = tokens_a.into_iter().collect();
382    let set_b: HashSet<&str> = tokens_b.into_iter().collect();
383
384    let intersection_size = set_a.intersection(&set_b).count();
385    let union_size = set_a.union(&set_b).count();
386
387    if union_size == 0 {
388        return 0.0;
389    }
390
391    intersection_size as f64 / union_size as f64
392}
393
#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::hash::content_hash;

    /// Builds a parentless `function` entity spanning line 1 only; the content
    /// hash is derived from `content`.
    fn make_entity(id: &str, name: &str, content: &str, file_path: &str) -> SemanticEntity {
        SemanticEntity {
            id: String::from(id),
            file_path: String::from(file_path),
            entity_type: String::from("function"),
            name: String::from(name),
            parent_id: None,
            content: String::from(content),
            content_hash: content_hash(content),
            structural_hash: None,
            start_line: 1,
            end_line: 1,
            metadata: None,
        }
    }

    /// Runs the matcher for `a.ts` with no similarity callback, commit or author.
    fn diff(before: &[SemanticEntity], after: &[SemanticEntity]) -> MatchResult {
        match_entities(before, after, "a.ts", None, None, None)
    }

    #[test]
    fn test_exact_match_modified() {
        let result = diff(
            &[make_entity("a::f::foo", "foo", "old content", "a.ts")],
            &[make_entity("a::f::foo", "foo", "new content", "a.ts")],
        );
        assert_eq!(result.changes.len(), 1);
        assert_eq!(result.changes[0].change_type, ChangeType::Modified);
    }

    #[test]
    fn test_exact_match_unchanged() {
        let result = diff(
            &[make_entity("a::f::foo", "foo", "same", "a.ts")],
            &[make_entity("a::f::foo", "foo", "same", "a.ts")],
        );
        assert_eq!(result.changes.len(), 0);
    }

    #[test]
    fn test_added_deleted() {
        let result = diff(
            &[make_entity("a::f::old", "old", "content", "a.ts")],
            &[make_entity("a::f::new", "new", "different", "a.ts")],
        );
        assert_eq!(result.changes.len(), 2);
        let has = |wanted: ChangeType| result.changes.iter().any(|c| c.change_type == wanted);
        assert!(has(ChangeType::Deleted));
        assert!(has(ChangeType::Added));
    }

    #[test]
    fn test_content_hash_rename() {
        let result = diff(
            &[make_entity("a::f::old", "old", "same content", "a.ts")],
            &[make_entity("a::f::new", "new", "same content", "a.ts")],
        );
        assert_eq!(result.changes.len(), 1);
        assert_eq!(result.changes[0].change_type, ChangeType::Renamed);
    }

    #[test]
    fn test_parent_child_dedup_class_method() {
        // The class content embeds the method body, and the method's parent_id
        // holds the full entity ID of the class.
        const CLASS_ID: &str = "a.ts::class::DataStack";
        const METHOD_ID: &str = "a.ts::a.ts::class::DataStack::genPg";

        let class_before = SemanticEntity {
            entity_type: "class".to_string(),
            end_line: 10,
            ..make_entity(
                CLASS_ID,
                "DataStack",
                "class DataStack { constructor() {} genPg() { old } }",
                "a.ts",
            )
        };
        let method_before = SemanticEntity {
            entity_type: "method".to_string(),
            parent_id: Some(CLASS_ID.to_string()),
            start_line: 5,
            end_line: 8,
            ..make_entity(METHOD_ID, "genPg", "genPg() { old }", "a.ts")
        };

        let class_after = SemanticEntity {
            entity_type: "class".to_string(),
            end_line: 10,
            ..make_entity(
                CLASS_ID,
                "DataStack",
                "class DataStack { constructor() {} genPg() { new } }",
                "a.ts",
            )
        };
        let method_after = SemanticEntity {
            entity_type: "method".to_string(),
            parent_id: Some(CLASS_ID.to_string()),
            start_line: 5,
            end_line: 8,
            ..make_entity(METHOD_ID, "genPg", "genPg() { new }", "a.ts")
        };

        let result = diff(&[class_before, method_before], &[class_after, method_after]);

        // Only the method change is expected; the class change is folded away.
        assert_eq!(result.changes.len(), 1);
        assert_eq!(result.changes[0].entity_name, "genPg");
        assert_eq!(result.changes[0].change_type, ChangeType::Modified);
    }

    #[test]
    fn test_parent_not_deduped_when_no_child_changes() {
        // Only class-level content changes (a field is added); the method is untouched.
        const CLASS_ID: &str = "a.ts::class::Foo";
        const METHOD_ID: &str = "a.ts::a.ts::class::Foo::bar";

        let class_before = SemanticEntity {
            entity_type: "class".to_string(),
            end_line: 5,
            ..make_entity(CLASS_ID, "Foo", "class Foo { bar() {} }", "a.ts")
        };
        let method_before = SemanticEntity {
            entity_type: "method".to_string(),
            parent_id: Some(CLASS_ID.to_string()),
            start_line: 2,
            end_line: 4,
            ..make_entity(METHOD_ID, "bar", "bar() {}", "a.ts")
        };

        let class_after = SemanticEntity {
            entity_type: "class".to_string(),
            end_line: 6,
            ..make_entity(CLASS_ID, "Foo", "class Foo { x = 1; bar() {} }", "a.ts")
        };
        let method_after = SemanticEntity {
            entity_type: "method".to_string(),
            parent_id: Some(CLASS_ID.to_string()),
            start_line: 3,
            end_line: 5,
            ..make_entity(METHOD_ID, "bar", "bar() {}", "a.ts")
        };

        let result = diff(&[class_before, method_before], &[class_after, method_after]);

        // The class changed and the method did not, so the class must still be reported.
        assert_eq!(result.changes.len(), 1);
        assert_eq!(result.changes[0].entity_name, "Foo");
        assert_eq!(result.changes[0].change_type, ChangeType::Modified);
    }

    #[test]
    fn test_default_similarity() {
        let fox = make_entity("a", "a", "the quick brown fox", "a.ts");
        let dog = make_entity("b", "b", "the quick brown dog", "a.ts");
        let score = default_similarity(&fox, &dog);
        assert!(score > 0.5);
        assert!(score < 1.0);
    }
}