Skip to main content

sem_core/parser/
differ.rs

1#[cfg(feature = "parallel")]
2use rayon::prelude::*;
3use serde::Serialize;
4
5use crate::git::types::FileChange;
6
// Picks the iteration strategy for a slice at compile time: rayon's
// `par_iter()` when the `parallel` feature is enabled, the ordinary
// sequential `iter()` otherwise. Exactly one of the two definitions below
// is compiled.
#[cfg(feature = "parallel")]
macro_rules! maybe_par_iter {
    ($slice:expr) => {
        $slice.par_iter()
    };
}

#[cfg(not(feature = "parallel"))]
macro_rules! maybe_par_iter {
    ($slice:expr) => {
        $slice.iter()
    };
}
19use crate::model::change::{ChangeType, SemanticChange};
20use crate::model::entity::SemanticEntity;
21use crate::model::identity::match_entities;
22use crate::parser::registry::ParserRegistry;
23use std::collections::{HashMap, HashSet};
24
25#[derive(Debug, Clone, Serialize)]
26#[serde(rename_all = "camelCase")]
27pub struct DiffResult {
28    pub changes: Vec<SemanticChange>,
29    pub file_count: usize,
30    pub added_count: usize,
31    pub modified_count: usize,
32    pub deleted_count: usize,
33    pub moved_count: usize,
34    pub renamed_count: usize,
35    pub reordered_count: usize,
36    pub orphan_count: usize,
37    pub total_entities_before: usize,
38    pub total_entities_after: usize,
39}
40
41pub fn compute_semantic_diff(
42    file_changes: &[FileChange],
43    registry: &ParserRegistry,
44    commit_sha: Option<&str>,
45    author: Option<&str>,
46) -> DiffResult {
47    // Process files in parallel: each file's entity extraction and matching is independent
48    let per_file_changes: Vec<(String, Vec<SemanticChange>, usize, usize)> =
49        maybe_par_iter!(file_changes)
50            .filter_map(|file| {
51                let content_hint = file
52                    .after_content
53                    .as_deref()
54                    .or(file.before_content.as_deref())
55                    .unwrap_or("");
56                let resolved = registry.resolve_file_path(&file.file_path);
57                let detection_path = resolved.as_deref().unwrap_or(&file.file_path);
58                let plugin = registry.get_plugin_with_content(detection_path, content_hint)?;
59
60                let before_entities = if let Some(ref content) = file.before_content {
61                    let before_path = file.old_file_path.as_deref().unwrap_or(&file.file_path);
62                    let before_resolved = registry.resolve_file_path(before_path);
63                    let before_detection = before_resolved.as_deref().unwrap_or(before_path);
64                    match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
65                        plugin.extract_entities(content, before_detection)
66                    })) {
67                        Ok(entities) => entities,
68                        Err(_) => Vec::new(),
69                    }
70                } else {
71                    Vec::new()
72                };
73
74                let after_entities = if let Some(ref content) = file.after_content {
75                    match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
76                        plugin.extract_entities(content, detection_path)
77                    })) {
78                        Ok(entities) => entities,
79                        Err(_) => Vec::new(),
80                    }
81                } else {
82                    Vec::new()
83                };
84
85                let before_count = before_entities.len();
86                let after_count = after_entities.len();
87
88                let sim_fn = |a: &crate::model::entity::SemanticEntity,
89                              b: &crate::model::entity::SemanticEntity|
90                 -> f64 { plugin.compute_similarity(a, b) };
91
92                let mut result = match_entities(
93                    &before_entities,
94                    &after_entities,
95                    &file.file_path,
96                    Some(&sim_fn),
97                    commit_sha,
98                    author,
99                );
100
101                // Suppress parent entities whose modification is already explained
102                // by child entity changes (e.g. impl blocks when methods changed).
103                suppress_redundant_parents(&mut result.changes, &before_entities, &after_entities);
104
105                // Detect orphan changes (lines that changed outside any entity span).
106                let orphans = detect_orphan_changes(
107                    file,
108                    &before_entities,
109                    &after_entities,
110                    commit_sha,
111                    author,
112                );
113                result.changes.extend(orphans);
114
115                result.changes.sort_by_key(|change| change.entity_line);
116
117                if result.changes.is_empty() {
118                    None
119                } else {
120                    Some((
121                        file.file_path.clone(),
122                        result.changes,
123                        before_count,
124                        after_count,
125                    ))
126                }
127            })
128            .collect();
129
130    let mut all_changes: Vec<SemanticChange> = Vec::new();
131    let mut files_with_changes: HashSet<String> = HashSet::new();
132    let mut total_entities_before: usize = 0;
133    let mut total_entities_after: usize = 0;
134    for (file_path, changes, before_count, after_count) in per_file_changes {
135        files_with_changes.insert(file_path);
136        all_changes.extend(changes);
137        total_entities_before += before_count;
138        total_entities_after += after_count;
139    }
140
141    // Single-pass counting (exclude orphan changes from entity counts)
142    let mut added_count = 0;
143    let mut modified_count = 0;
144    let mut deleted_count = 0;
145    let mut moved_count = 0;
146    let mut renamed_count = 0;
147    let mut reordered_count = 0;
148    let mut orphan_count = 0;
149
150    for c in &all_changes {
151        if c.entity_type == "orphan" {
152            orphan_count += 1;
153            continue;
154        }
155        match c.change_type {
156            ChangeType::Added => added_count += 1,
157            ChangeType::Modified => modified_count += 1,
158            ChangeType::Deleted => deleted_count += 1,
159            ChangeType::Moved => moved_count += 1,
160            ChangeType::Renamed => renamed_count += 1,
161            ChangeType::Reordered => reordered_count += 1,
162        }
163    }
164
165    DiffResult {
166        changes: all_changes,
167        file_count: files_with_changes.len(),
168        added_count,
169        modified_count,
170        deleted_count,
171        moved_count,
172        renamed_count,
173        reordered_count,
174        orphan_count,
175        total_entities_before,
176        total_entities_after,
177    }
178}
179
180fn suppress_redundant_parents(
181    changes: &mut Vec<SemanticChange>,
182    before: &[SemanticEntity],
183    after: &[SemanticEntity],
184) {
185    if changes.len() < 2 {
186        return;
187    }
188
189    const CONTAINER_TYPES: &[&str] = &[
190        "impl",
191        "trait",
192        "module",
193        "class",
194        "interface",
195        "mixin",
196        "extension",
197        "namespace",
198        "export",
199        "package",
200        "svelte_instance_script",
201        "svelte_module_script",
202        "object",
203    ];
204
205    let before_by_id: HashMap<&str, &SemanticEntity> =
206        before.iter().map(|e| (e.id.as_str(), e)).collect();
207    let after_by_id: HashMap<&str, &SemanticEntity> =
208        after.iter().map(|e| (e.id.as_str(), e)).collect();
209
210    let mut before_children: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
211    for e in before {
212        if let Some(ref pid) = e.parent_id {
213            before_children.entry(pid.as_str()).or_default().push(e);
214        }
215    }
216    let mut after_children: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
217    for e in after {
218        if let Some(ref pid) = e.parent_id {
219            after_children.entry(pid.as_str()).or_default().push(e);
220        }
221    }
222
223    let changed_ids: HashSet<&str> = changes.iter().map(|c| c.entity_id.as_str()).collect();
224
225    let mut suppress: HashSet<String> = HashSet::new();
226    for change in changes.iter() {
227        if !matches!(
228            change.change_type,
229            ChangeType::Modified | ChangeType::Added | ChangeType::Deleted
230        ) {
231            continue;
232        }
233        if !CONTAINER_TYPES.contains(&change.entity_type.as_str()) {
234            continue;
235        }
236        let eid = change.entity_id.as_str();
237        let b_children = before_children
238            .get(eid)
239            .map(|v| v.as_slice())
240            .unwrap_or(&[]);
241        let a_children = after_children.get(eid).map(|v| v.as_slice()).unwrap_or(&[]);
242
243        let has_changed_child = b_children
244            .iter()
245            .any(|c| changed_ids.contains(c.id.as_str()))
246            || a_children
247                .iter()
248                .any(|c| changed_ids.contains(c.id.as_str()));
249        if !has_changed_child {
250            continue;
251        }
252
253        // Added/Deleted: suppress unconditionally; the children carry the detail.
254        // Modified: only suppress if the container's own declaration is unchanged
255        // and the value type didn't transition.
256        let should_suppress = if change.change_type == ChangeType::Modified {
257            match (before_by_id.get(eid), after_by_id.get(eid)) {
258                (Some(bp), Some(ap)) if bp.entity_type == ap.entity_type => {
259                    let before_own = strip_children_content(&bp.content, bp.start_line, b_children);
260                    let after_own = strip_children_content(&ap.content, ap.start_line, a_children);
261                    before_own == after_own
262                }
263                _ => false,
264            }
265        } else {
266            true
267        };
268
269        if should_suppress {
270            suppress.insert(change.entity_id.clone());
271        }
272    }
273
274    // Suppress an old parent that a Moved child left behind when the old
275    // parent itself appears as a change — handles the parent-rename case
276    // where the parent itself failed to match.
277    for change in changes.iter() {
278        if change.change_type == ChangeType::Moved {
279            if let Some(ref old_pid) = change.old_parent_id {
280                if changed_ids.contains(old_pid.as_str()) {
281                    suppress.insert(old_pid.clone());
282                }
283            }
284        }
285    }
286
287    if !suppress.is_empty() {
288        changes.retain(|c| !suppress.contains(&c.entity_id));
289    }
290
291    // Drop a Moved child whose key is unchanged and whose old parent matches
292    // a Renamed entity — the child only "moved" because the parent renamed.
293    let renamed_before_ids: HashSet<&str> = changes
294        .iter()
295        .filter(|c| c.change_type == ChangeType::Renamed)
296        .filter_map(|c| {
297            let old_name = c.old_entity_name.as_deref()?;
298            let after_entity = after_by_id.get(c.entity_id.as_str())?;
299            before
300                .iter()
301                .find(|e| {
302                    e.name == old_name
303                        && e.entity_type == after_entity.entity_type
304                        && e.parent_id == after_entity.parent_id
305                })
306                .map(|e| e.id.as_str())
307        })
308        .collect();
309
310    if !renamed_before_ids.is_empty() {
311        changes.retain(|c| {
312            !(c.change_type == ChangeType::Moved
313                && c.old_entity_name.is_none()
314                && c.old_parent_id
315                    .as_deref()
316                    .map_or(false, |pid| renamed_before_ids.contains(pid)))
317        });
318    }
319}
320
321fn strip_children_content(
322    content: &str,
323    parent_start_line: usize,
324    children: &[&SemanticEntity],
325) -> String {
326    let lines: Vec<&str> = content.lines().collect();
327    let mut excluded: HashSet<usize> = HashSet::new();
328    for child in children {
329        let start_idx = child.start_line.saturating_sub(parent_start_line);
330        let end_idx = child.end_line.saturating_sub(parent_start_line);
331        for i in start_idx..=end_idx.max(start_idx) {
332            if i < lines.len() {
333                excluded.insert(i);
334            }
335        }
336    }
337    lines
338        .iter()
339        .enumerate()
340        .filter(|(i, _)| !excluded.contains(i))
341        .map(|(_, l)| l.trim())
342        .filter(|l| !l.is_empty())
343        .collect::<Vec<_>>()
344        .join(" ")
345}
346
347/// Detect changes in lines that fall outside any entity span.
348/// These are things like use statements, crate-level attributes, standalone
349/// comments, and macro invocations that aren't tracked as entities.
350fn detect_orphan_changes(
351    file: &FileChange,
352    before_entities: &[SemanticEntity],
353    after_entities: &[SemanticEntity],
354    commit_sha: Option<&str>,
355    author: Option<&str>,
356) -> Vec<SemanticChange> {
357    let before_text = file.before_content.as_deref().unwrap_or("");
358    let after_text = file.after_content.as_deref().unwrap_or("");
359
360    // Build covered line sets from entity spans
361    let before_covered: HashSet<usize> = before_entities
362        .iter()
363        .flat_map(|e| e.start_line..=e.end_line)
364        .collect();
365    let after_covered: HashSet<usize> = after_entities
366        .iter()
367        .flat_map(|e| e.start_line..=e.end_line)
368        .collect();
369
370    // Extract uncovered lines, preserving line numbers for context
371    let before_orphan: String = before_text
372        .lines()
373        .enumerate()
374        .filter(|(i, _)| !before_covered.contains(&(i + 1)))
375        .map(|(_, l)| l)
376        .collect::<Vec<_>>()
377        .join("\n");
378    let after_orphan: String = after_text
379        .lines()
380        .enumerate()
381        .filter(|(i, _)| !after_covered.contains(&(i + 1)))
382        .map(|(_, l)| l)
383        .collect::<Vec<_>>()
384        .join("\n");
385
386    // Skip if orphan content is unchanged
387    if before_orphan == after_orphan {
388        return Vec::new();
389    }
390
391    let change_type = if before_orphan.trim().is_empty() {
392        ChangeType::Added
393    } else if after_orphan.trim().is_empty() {
394        ChangeType::Deleted
395    } else {
396        ChangeType::Modified
397    };
398
399    vec![SemanticChange {
400        id: format!("{}::orphan", file.file_path),
401        entity_id: format!("{}::orphan", file.file_path),
402        change_type,
403        entity_type: "orphan".to_string(),
404        entity_name: "module-level".to_string(),
405        entity_line: 0,
406        parent_name: None,
407        file_path: file.file_path.clone(),
408        old_entity_name: None,
409        old_file_path: None,
410        old_parent_id: None,
411        before_content: if before_orphan.is_empty() {
412            None
413        } else {
414            Some(before_orphan)
415        },
416        after_content: if after_orphan.is_empty() {
417            None
418        } else {
419            Some(after_orphan)
420        },
421        commit_sha: commit_sha.map(String::from),
422        author: author.map(String::from),
423        timestamp: None,
424        structural_change: Some(true),
425    }]
426}
427
#[cfg(test)]
mod tests {
    use super::*;
    use crate::git::types::{FileChange, FileStatus};
    use crate::parser::plugins::create_default_registry;

    // Builds an in-place modification of `path`.
    fn modified_file(path: &str, before: &str, after: &str) -> FileChange {
        FileChange {
            file_path: path.to_string(),
            status: FileStatus::Modified,
            old_file_path: None,
            before_content: Some(before.to_string()),
            after_content: Some(after.to_string()),
        }
    }

    // Builds a rename from `old_path` to `new_path` with edited content.
    fn renamed_file(old_path: &str, new_path: &str, before: &str, after: &str) -> FileChange {
        FileChange {
            file_path: new_path.to_string(),
            status: FileStatus::Renamed,
            old_file_path: Some(old_path.to_string()),
            before_content: Some(before.to_string()),
            after_content: Some(after.to_string()),
        }
    }

    // Runs the diff for a single file with the default registry and no
    // commit metadata.
    fn diff_single(file: FileChange) -> DiffResult {
        let registry = create_default_registry();
        compute_semantic_diff(&[file], &registry, None, None)
    }

    // Names of all reported entities, used in failure messages.
    fn entity_names(result: &DiffResult) -> Vec<&str> {
        result
            .changes
            .iter()
            .map(|c| c.entity_name.as_str())
            .collect()
    }

    fn reports_entity(result: &DiffResult, name: &str) -> bool {
        result.changes.iter().any(|c| c.entity_name == name)
    }

    fn reports_modified(result: &DiffResult, name: &str) -> bool {
        result
            .changes
            .iter()
            .any(|c| c.entity_name == name && c.change_type == ChangeType::Modified)
    }

    #[test]
    fn test_parent_suppressed_when_only_child_modified() {
        let before = "class UserService:\n    def get_user(self, user_id):\n        return db.find(user_id)\n";
        let after  = "class UserService:\n    def get_user(self, user_id):\n        return db.find(user_id, include_deleted=False)\n";

        let result = diff_single(modified_file("svc.py", before, after));
        let names = entity_names(&result);

        assert!(
            reports_entity(&result, "get_user"),
            "expected method get_user in changes, got: {names:?}"
        );
        assert!(
            !reports_modified(&result, "UserService"),
            "class should be suppressed when only the method body changed, got: {names:?}"
        );
    }

    #[test]
    fn test_parent_not_suppressed_when_own_declaration_changes() {
        let before = "class UserService:\n    def get_user(self, user_id):\n        return db.find(user_id)\n";
        let after  = "class UserService(BaseService):\n    def get_user(self, user_id):\n        return db.find(user_id, include_deleted=False)\n";

        let result = diff_single(modified_file("svc.py", before, after));
        let names = entity_names(&result);

        assert!(
            reports_entity(&result, "get_user"),
            "expected method get_user in changes, got: {names:?}"
        );
        assert!(
            reports_modified(&result, "UserService"),
            "class should remain Modified when its own declaration changed, got: {names:?}"
        );
    }

    #[test]
    fn renamed_file_with_edited_entity_reports_move_not_add_delete() {
        let before = "def foo():\n    return alpha + beta + gamma\n";
        let after = "def foo():\n    return one + two + three\n";

        let result = diff_single(renamed_file("old.py", "new.py", before, after));

        assert_eq!(result.added_count, 0);
        assert_eq!(result.deleted_count, 0);
        assert_eq!(result.moved_count, 1);
        assert_eq!(result.changes.len(), 1);

        let only_change = &result.changes[0];
        assert_eq!(only_change.entity_name, "foo");
        assert_eq!(only_change.old_file_path.as_deref(), Some("old.py"));
    }
}