// sem_core/parser/differ.rs

use std::collections::{HashMap, HashSet};

use rayon::prelude::*;
use serde::Serialize;

use crate::git::types::FileChange;
use crate::model::change::{ChangeType, SemanticChange};
use crate::model::entity::SemanticEntity;
use crate::model::identity::match_entities;
use crate::parser::registry::ParserRegistry;

11#[derive(Debug, Clone, Serialize)]
12#[serde(rename_all = "camelCase")]
13pub struct DiffResult {
14    pub changes: Vec<SemanticChange>,
15    pub file_count: usize,
16    pub added_count: usize,
17    pub modified_count: usize,
18    pub deleted_count: usize,
19    pub moved_count: usize,
20    pub renamed_count: usize,
21    pub reordered_count: usize,
22    pub orphan_count: usize,
23}
24
25pub fn compute_semantic_diff(
26    file_changes: &[FileChange],
27    registry: &ParserRegistry,
28    commit_sha: Option<&str>,
29    author: Option<&str>,
30) -> DiffResult {
31    // Process files in parallel: each file's entity extraction and matching is independent
32    let per_file_changes: Vec<(String, Vec<SemanticChange>)> = file_changes
33        .par_iter()
34        .filter_map(|file| {
35            let content_hint = file.after_content.as_deref()
36                .or(file.before_content.as_deref())
37                .unwrap_or("");
38            let resolved = registry.resolve_file_path(&file.file_path);
39            let detection_path = resolved.as_deref().unwrap_or(&file.file_path);
40            let plugin = registry.get_plugin_with_content(detection_path, content_hint)?;
41
42            let before_entities = if let Some(ref content) = file.before_content {
43                let before_path = file.old_file_path.as_deref().unwrap_or(&file.file_path);
44                let before_resolved = registry.resolve_file_path(before_path);
45                let before_detection = before_resolved.as_deref().unwrap_or(before_path);
46                match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
47                    plugin.extract_entities(content, before_detection)
48                })) {
49                    Ok(entities) => entities,
50                    Err(_) => Vec::new(),
51                }
52            } else {
53                Vec::new()
54            };
55
56            let after_entities = if let Some(ref content) = file.after_content {
57                match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
58                    plugin.extract_entities(content, detection_path)
59                })) {
60                    Ok(entities) => entities,
61                    Err(_) => Vec::new(),
62                }
63            } else {
64                Vec::new()
65            };
66
67            let sim_fn = |a: &crate::model::entity::SemanticEntity,
68                          b: &crate::model::entity::SemanticEntity|
69             -> f64 { plugin.compute_similarity(a, b) };
70
71            let mut result = match_entities(
72                &before_entities,
73                &after_entities,
74                &file.file_path,
75                Some(&sim_fn),
76                commit_sha,
77                author,
78            );
79
80            // Suppress parent entities whose modification is already explained
81            // by child entity changes (e.g. impl blocks when methods changed).
82            suppress_redundant_parents(&mut result.changes, &before_entities, &after_entities);
83
84            // Detect orphan changes (lines that changed outside any entity span).
85            let orphans = detect_orphan_changes(
86                file,
87                &before_entities,
88                &after_entities,
89                commit_sha,
90                author,
91            );
92            result.changes.extend(orphans);
93
94            result.changes.sort_by_key(|change| change.entity_line);
95
96            if result.changes.is_empty() {
97                None
98            } else {
99                Some((file.file_path.clone(), result.changes))
100            }
101        })
102        .collect();
103
104    let mut all_changes: Vec<SemanticChange> = Vec::new();
105    let mut files_with_changes: HashSet<String> = HashSet::new();
106    for (file_path, changes) in per_file_changes {
107        files_with_changes.insert(file_path);
108        all_changes.extend(changes);
109    }
110
111    // Single-pass counting (exclude orphan changes from entity counts)
112    let mut added_count = 0;
113    let mut modified_count = 0;
114    let mut deleted_count = 0;
115    let mut moved_count = 0;
116    let mut renamed_count = 0;
117    let mut reordered_count = 0;
118    let mut orphan_count = 0;
119
120    for c in &all_changes {
121        if c.entity_type == "orphan" {
122            orphan_count += 1;
123            continue;
124        }
125        match c.change_type {
126            ChangeType::Added => added_count += 1,
127            ChangeType::Modified => modified_count += 1,
128            ChangeType::Deleted => deleted_count += 1,
129            ChangeType::Moved => moved_count += 1,
130            ChangeType::Renamed => renamed_count += 1,
131            ChangeType::Reordered => reordered_count += 1,
132        }
133    }
134
135    DiffResult {
136        changes: all_changes,
137        file_count: files_with_changes.len(),
138        added_count,
139        modified_count,
140        deleted_count,
141        moved_count,
142        renamed_count,
143        reordered_count,
144        orphan_count,
145    }
146}
147
148fn suppress_redundant_parents(
149    changes: &mut Vec<SemanticChange>,
150    before: &[SemanticEntity],
151    after: &[SemanticEntity],
152) {
153    if changes.len() < 2 {
154        return;
155    }
156
157    const CONTAINER_TYPES: &[&str] = &[
158        "impl", "trait", "module", "class", "interface", "mixin",
159        "extension", "namespace", "export", "package",
160        "svelte_instance_script", "svelte_module_script",
161        "object",
162    ];
163
164    let before_by_id: HashMap<&str, &SemanticEntity> =
165        before.iter().map(|e| (e.id.as_str(), e)).collect();
166    let after_by_id: HashMap<&str, &SemanticEntity> =
167        after.iter().map(|e| (e.id.as_str(), e)).collect();
168
169    let mut before_children: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
170    for e in before {
171        if let Some(ref pid) = e.parent_id {
172            before_children.entry(pid.as_str()).or_default().push(e);
173        }
174    }
175    let mut after_children: HashMap<&str, Vec<&SemanticEntity>> = HashMap::new();
176    for e in after {
177        if let Some(ref pid) = e.parent_id {
178            after_children.entry(pid.as_str()).or_default().push(e);
179        }
180    }
181
182    let changed_ids: HashSet<&str> = changes.iter().map(|c| c.entity_id.as_str()).collect();
183
184    let mut suppress: HashSet<String> = HashSet::new();
185    for change in changes.iter() {
186        if !matches!(change.change_type, ChangeType::Modified | ChangeType::Added | ChangeType::Deleted) {
187            continue;
188        }
189        if !CONTAINER_TYPES.contains(&change.entity_type.as_str()) {
190            continue;
191        }
192        let eid = change.entity_id.as_str();
193        let b_children = before_children.get(eid).map(|v| v.as_slice()).unwrap_or(&[]);
194        let a_children = after_children.get(eid).map(|v| v.as_slice()).unwrap_or(&[]);
195
196        let has_changed_child = b_children.iter().any(|c| changed_ids.contains(c.id.as_str()))
197            || a_children.iter().any(|c| changed_ids.contains(c.id.as_str()));
198        if !has_changed_child {
199            continue;
200        }
201
202        // Added/Deleted: suppress unconditionally; the children carry the detail.
203        // Modified: only suppress if the container's own declaration is unchanged
204        // and the value type didn't transition.
205        let should_suppress = if change.change_type == ChangeType::Modified {
206            match (before_by_id.get(eid), after_by_id.get(eid)) {
207                (Some(bp), Some(ap)) if bp.entity_type == ap.entity_type => {
208                    let before_own = strip_children_content(&bp.content, bp.start_line, b_children);
209                    let after_own = strip_children_content(&ap.content, ap.start_line, a_children);
210                    before_own == after_own
211                }
212                _ => false,
213            }
214        } else {
215            true
216        };
217
218        if should_suppress {
219            suppress.insert(change.entity_id.clone());
220        }
221    }
222
223    // Suppress an old parent that a Moved child left behind when the old
224    // parent itself appears as a change — handles the parent-rename case
225    // where the parent itself failed to match.
226    for change in changes.iter() {
227        if change.change_type == ChangeType::Moved {
228            if let Some(ref old_pid) = change.old_parent_id {
229                if changed_ids.contains(old_pid.as_str()) {
230                    suppress.insert(old_pid.clone());
231                }
232            }
233        }
234    }
235
236    if !suppress.is_empty() {
237        changes.retain(|c| !suppress.contains(&c.entity_id));
238    }
239
240    // Drop a Moved child whose key is unchanged and whose old parent matches
241    // a Renamed entity — the child only "moved" because the parent renamed.
242    let renamed_before_ids: HashSet<&str> = changes
243        .iter()
244        .filter(|c| c.change_type == ChangeType::Renamed)
245        .filter_map(|c| {
246            let old_name = c.old_entity_name.as_deref()?;
247            let after_entity = after_by_id.get(c.entity_id.as_str())?;
248            before.iter()
249                .find(|e| {
250                    e.name == old_name
251                        && e.entity_type == after_entity.entity_type
252                        && e.parent_id == after_entity.parent_id
253                })
254                .map(|e| e.id.as_str())
255        })
256        .collect();
257
258    if !renamed_before_ids.is_empty() {
259        changes.retain(|c| {
260            !(c.change_type == ChangeType::Moved
261                && c.old_entity_name.is_none()
262                && c.old_parent_id.as_deref()
263                    .map_or(false, |pid| renamed_before_ids.contains(pid)))
264        });
265    }
266}
267
268fn strip_children_content(content: &str, parent_start_line: usize, children: &[&SemanticEntity]) -> String {
269    let lines: Vec<&str> = content.lines().collect();
270    let mut excluded: HashSet<usize> = HashSet::new();
271    for child in children {
272        let start_idx = child.start_line.saturating_sub(parent_start_line);
273        let end_idx = child.end_line.saturating_sub(parent_start_line);
274        for i in start_idx..=end_idx.max(start_idx) {
275            if i < lines.len() {
276                excluded.insert(i);
277            }
278        }
279    }
280    lines.iter().enumerate()
281        .filter(|(i, _)| !excluded.contains(i))
282        .map(|(_, l)| l.trim())
283        .filter(|l| !l.is_empty())
284        .collect::<Vec<_>>()
285        .join(" ")
286}
287
288/// Detect changes in lines that fall outside any entity span.
289/// These are things like use statements, crate-level attributes, standalone
290/// comments, and macro invocations that aren't tracked as entities.
291fn detect_orphan_changes(
292    file: &FileChange,
293    before_entities: &[SemanticEntity],
294    after_entities: &[SemanticEntity],
295    commit_sha: Option<&str>,
296    author: Option<&str>,
297) -> Vec<SemanticChange> {
298    let before_text = file.before_content.as_deref().unwrap_or("");
299    let after_text = file.after_content.as_deref().unwrap_or("");
300
301    // Build covered line sets from entity spans
302    let before_covered: HashSet<usize> = before_entities
303        .iter()
304        .flat_map(|e| e.start_line..=e.end_line)
305        .collect();
306    let after_covered: HashSet<usize> = after_entities
307        .iter()
308        .flat_map(|e| e.start_line..=e.end_line)
309        .collect();
310
311    // Extract uncovered lines, preserving line numbers for context
312    let before_orphan: String = before_text
313        .lines()
314        .enumerate()
315        .filter(|(i, _)| !before_covered.contains(&(i + 1)))
316        .map(|(_, l)| l)
317        .collect::<Vec<_>>()
318        .join("\n");
319    let after_orphan: String = after_text
320        .lines()
321        .enumerate()
322        .filter(|(i, _)| !after_covered.contains(&(i + 1)))
323        .map(|(_, l)| l)
324        .collect::<Vec<_>>()
325        .join("\n");
326
327    // Skip if orphan content is unchanged
328    if before_orphan == after_orphan {
329        return Vec::new();
330    }
331
332    let change_type = if before_orphan.trim().is_empty() {
333        ChangeType::Added
334    } else if after_orphan.trim().is_empty() {
335        ChangeType::Deleted
336    } else {
337        ChangeType::Modified
338    };
339
340    vec![SemanticChange {
341        id: format!("{}::orphan", file.file_path),
342        entity_id: format!("{}::orphan", file.file_path),
343        change_type,
344        entity_type: "orphan".to_string(),
345        entity_name: "module-level".to_string(),
346        entity_line: 0,
347        parent_name: None,
348        file_path: file.file_path.clone(),
349        old_entity_name: None,
350        old_file_path: None,
351        old_parent_id: None,
352        before_content: if before_orphan.is_empty() {
353            None
354        } else {
355            Some(before_orphan)
356        },
357        after_content: if after_orphan.is_empty() {
358            None
359        } else {
360            Some(after_orphan)
361        },
362        commit_sha: commit_sha.map(String::from),
363        author: author.map(String::from),
364        timestamp: None,
365        structural_change: Some(true),
366    }]
367}
368
#[cfg(test)]
mod tests {
    use super::*;
    use crate::git::types::{FileChange, FileStatus};
    use crate::parser::plugins::create_default_registry;

    /// Builds a `FileChange` for an in-place modification of `path`.
    fn modified_file(path: &str, before: &str, after: &str) -> FileChange {
        FileChange {
            file_path: path.to_string(),
            status: FileStatus::Modified,
            old_file_path: None,
            before_content: Some(before.to_string()),
            after_content: Some(after.to_string()),
        }
    }

    /// Diffs a single modified `svc.py` using the default parser registry.
    fn diff_service_file(before: &str, after: &str) -> DiffResult {
        let registry = create_default_registry();
        compute_semantic_diff(&[modified_file("svc.py", before, after)], &registry, None, None)
    }

    /// True when `result` contains a `Modified` change for the `UserService` class.
    fn class_reported_modified(result: &DiffResult) -> bool {
        result
            .changes
            .iter()
            .any(|c| c.entity_name == "UserService" && c.change_type == ChangeType::Modified)
    }

    #[test]
    fn test_parent_suppressed_when_only_child_modified() {
        let before = "class UserService:\n    def get_user(self, user_id):\n        return db.find(user_id)\n";
        let after = "class UserService:\n    def get_user(self, user_id):\n        return db.find(user_id, include_deleted=False)\n";

        let result = diff_service_file(before, after);

        let names: Vec<&str> = result.changes.iter().map(|c| c.entity_name.as_str()).collect();
        assert!(
            result.changes.iter().any(|c| c.entity_name == "get_user"),
            "expected method get_user in changes, got: {names:?}"
        );
        assert!(
            !class_reported_modified(&result),
            "class should be suppressed when only the method body changed, got: {names:?}"
        );
    }

    #[test]
    fn test_parent_not_suppressed_when_own_declaration_changes() {
        let before = "class UserService:\n    def get_user(self, user_id):\n        return db.find(user_id)\n";
        let after = "class UserService(BaseService):\n    def get_user(self, user_id):\n        return db.find(user_id, include_deleted=False)\n";

        let result = diff_service_file(before, after);

        let names: Vec<&str> = result.changes.iter().map(|c| c.entity_name.as_str()).collect();
        assert!(
            result.changes.iter().any(|c| c.entity_name == "get_user"),
            "expected method get_user in changes, got: {names:?}"
        );
        assert!(
            class_reported_modified(&result),
            "class should remain Modified when its own declaration changed, got: {names:?}"
        );
    }
}