Skip to main content

semantic/analysis/
analysis_functions.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Function-level semantic changes.
3
4use std::collections::{BTreeMap, BTreeSet, HashSet};
5
6use objects::object::{ChangeImportance, SemanticChange};
7
8use super::analysis_similarity::{SimilarityMethod, compute_similarity_with_language};
9use crate::parser::{FunctionDef, Language, ParsedFile};
10
11const FUNCTION_RENAME_SIMILARITY_THRESHOLD: f64 = 0.58;
12
13/// Multi-map: function name → all definitions on this side with that
14/// name, in source order.
15///
16/// `BTreeMap<String, FunctionDef>` would silently collapse same-name
17/// redeclarations (JS allows two `function foo()` at module scope;
18/// Python allows repeated top-level `def foo()`). A prior fix (r1)
19/// keyed entries by `(name, occurrence)` to stop the collapse, but
20/// that paired old's `foo[0]` with new's `foo[0]` regardless of body
21/// content — so a fresh same-name definition inserted before existing
22/// ones produced a bogus FunctionModified with the wrong delta plus a
23/// misclassified add/delete (Codex cid 3259311747, heddle#125 r2).
24///
25/// Keeping a `Vec` per name lets us pair instances across versions by
26/// *content similarity* (see `pair_within_name`) rather than by
27/// within-side position. merge_driver keeps the positional keying
28/// (commit 2198b00) because its job is line-up merging within a
29/// logical slot; analysis_functions needs fuzzy cross-version identity.
30type FunctionMap = BTreeMap<String, Vec<FunctionDef>>;
31
32/// `(name, within-side index)` — identifies a single function instance
33/// on one side. Not a cross-version identity.
34type InstanceRef = (String, usize);
35
36fn build_function_map(parsed: Option<&ParsedFile>) -> FunctionMap {
37    let Some(parsed) = parsed else {
38        return BTreeMap::new();
39    };
40    let mut map = FunctionMap::new();
41    for func in parsed.extract_functions() {
42        map.entry(func.name.clone()).or_default().push(func);
43    }
44    map
45}
46
47/// Greedy best-similarity matcher within a single name bucket. Returns
48/// (paired old/new indices, unpaired old indices, unpaired new indices).
49fn pair_within_name(
50    olds: &[FunctionDef],
51    news: &[FunctionDef],
52    similarity_method: SimilarityMethod,
53    language: Language,
54) -> (Vec<(usize, usize)>, Vec<usize>, Vec<usize>) {
55    let mut candidates: Vec<(usize, usize, f64)> = Vec::with_capacity(olds.len() * news.len());
56    for (i, o) in olds.iter().enumerate() {
57        for (j, n) in news.iter().enumerate() {
58            let similarity = if o.content == n.content {
59                1.0
60            } else {
61                compute_similarity_with_language(
62                    &o.content,
63                    &n.content,
64                    similarity_method,
65                    language,
66                )
67            };
68            candidates.push((i, j, similarity));
69        }
70    }
71    // Highest similarity first; deterministic tiebreak: lowest old, then lowest new.
72    candidates
73        .sort_by(|(li, lj, ls), (ri, rj, rs)| rs.total_cmp(ls).then(li.cmp(ri)).then(lj.cmp(rj)));
74
75    let mut old_used = vec![false; olds.len()];
76    let mut new_used = vec![false; news.len()];
77    let mut pairs = Vec::new();
78    for (i, j, _) in candidates {
79        if !old_used[i] && !new_used[j] {
80            old_used[i] = true;
81            new_used[j] = true;
82            pairs.push((i, j));
83        }
84    }
85    let unmatched_old = old_used
86        .iter()
87        .enumerate()
88        .filter_map(|(i, used)| (!*used).then_some(i))
89        .collect();
90    let unmatched_new = new_used
91        .iter()
92        .enumerate()
93        .filter_map(|(j, used)| (!*used).then_some(j))
94        .collect();
95    (pairs, unmatched_old, unmatched_new)
96}
97
98/// Detect function-level changes between two file versions.
99pub fn detect_function_changes(
100    old_path: &std::path::Path,
101    new_path: &std::path::Path,
102    old_content: &str,
103    new_content: &str,
104    similarity_method: SimilarityMethod,
105) -> Vec<SemanticChange> {
106    let old_parsed = ParsedFile::parse(old_content, Language::from_path(old_path));
107    let new_parsed = ParsedFile::parse(new_content, Language::from_path(new_path));
108
109    detect_function_changes_with_parsed(
110        old_path,
111        new_path,
112        old_parsed.as_ref(),
113        new_parsed.as_ref(),
114        similarity_method,
115    )
116}
117
118pub(crate) fn detect_function_changes_with_parsed(
119    old_path: &std::path::Path,
120    new_path: &std::path::Path,
121    old_parsed: Option<&ParsedFile>,
122    new_parsed: Option<&ParsedFile>,
123    similarity_method: SimilarityMethod,
124) -> Vec<SemanticChange> {
125    let mut changes = Vec::new();
126    let mut file_modified = false;
127
128    let old_funcs = build_function_map(old_parsed);
129    let new_funcs = build_function_map(new_parsed);
130    let language = Language::from_path(new_path);
131
132    // Phase 1: pair instances within each name bucket by content similarity.
133    let mut pairs: Vec<(String, usize, usize)> = Vec::new();
134    let mut unmatched_old: Vec<InstanceRef> = Vec::new();
135    let mut unmatched_new: Vec<InstanceRef> = Vec::new();
136
137    let mut all_names: BTreeSet<&str> = BTreeSet::new();
138    all_names.extend(old_funcs.keys().map(String::as_str));
139    all_names.extend(new_funcs.keys().map(String::as_str));
140
141    let empty: Vec<FunctionDef> = Vec::new();
142    for name in &all_names {
143        let olds = old_funcs.get(*name).unwrap_or(&empty);
144        let news = new_funcs.get(*name).unwrap_or(&empty);
145        let (within, u_old, u_new) = pair_within_name(olds, news, similarity_method, language);
146        for (oi, ni) in within {
147            pairs.push(((*name).to_string(), oi, ni));
148        }
149        unmatched_old.extend(u_old.into_iter().map(|i| ((*name).to_string(), i)));
150        unmatched_new.extend(u_new.into_iter().map(|i| ((*name).to_string(), i)));
151    }
152    pairs.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| a.2.cmp(&b.2)));
153
154    let moved_function_names = stable_order_moved_names(&old_funcs, &new_funcs, &pairs);
155
156    // Phase 2: cross-name rename detection over leftovers.
157    let mut consumed_old: HashSet<InstanceRef> = HashSet::new();
158    for (new_name, ni) in &unmatched_new {
159        let new_func = &new_funcs[new_name][*ni];
160        let renamed_from = unmatched_old
161            .iter()
162            .filter(|(on, oi)| !consumed_old.contains(&(on.clone(), *oi)))
163            .filter(|(on, _)| on != new_name)
164            .filter_map(|(on, oi)| {
165                let old_func = &old_funcs[on][*oi];
166                let similarity = compute_similarity_with_language(
167                    &normalized_function_for_matching(&old_func.content, on),
168                    &normalized_function_for_matching(&new_func.content, new_name),
169                    similarity_method,
170                    language,
171                );
172                let same_location_update = old_path == new_path
173                    && old_func.start_line.abs_diff(new_func.start_line) <= 5
174                    && similarity >= 0.30;
175                (similarity >= FUNCTION_RENAME_SIMILARITY_THRESHOLD || same_location_update)
176                    .then_some(((on.clone(), *oi), similarity))
177            })
178            .max_by(
179                |(left_key, left_similarity), (right_key, right_similarity)| {
180                    left_similarity
181                        .total_cmp(right_similarity)
182                        .then_with(|| right_key.cmp(left_key))
183                },
184            )
185            .map(|(key, _)| key);
186
187        if let Some((old_name, old_idx)) = renamed_from {
188            consumed_old.insert((old_name.clone(), old_idx));
189            changes.push(SemanticChange::FunctionRenamed {
190                file: new_path.to_path_buf(),
191                old_name,
192                new_name: new_name.clone(),
193                importance: Some(ChangeImportance::Low),
194            });
195            file_modified = true;
196        } else {
197            let source = extraction_source(&old_funcs, new_func);
198            if let Some(source_name) = source {
199                changes.push(SemanticChange::FunctionExtracted {
200                    file: new_path.to_path_buf(),
201                    name: new_name.clone(),
202                    source_file: Some(old_path.to_path_buf()),
203                    source_name: Some(source_name),
204                    importance: Some(ChangeImportance::High),
205                });
206            } else {
207                changes.push(SemanticChange::FunctionAdded {
208                    file: new_path.to_path_buf(),
209                    name: new_name.clone(),
210                    importance: Some(ChangeImportance::High),
211                });
212            }
213            file_modified = true;
214        }
215    }
216
217    // Phase 3: unconsumed unmatched_old → deletions.
218    for (old_name, oi) in &unmatched_old {
219        if consumed_old.contains(&(old_name.clone(), *oi)) {
220            continue;
221        }
222        changes.push(SemanticChange::FunctionDeleted {
223            file: new_path.to_path_buf(),
224            name: old_name.clone(),
225            importance: Some(ChangeImportance::High),
226        });
227        file_modified = true;
228    }
229
230    // Phase 4: paired instances → signature / move / modified.
231    for (name, oi, ni) in &pairs {
232        let old_func = &old_funcs[name][*oi];
233        let new_func = &new_funcs[name][*ni];
234        if old_func.signature != new_func.signature {
235            changes.push(SemanticChange::SignatureChanged {
236                file: new_path.to_path_buf(),
237                name: name.clone(),
238                old_signature: old_func.signature.clone(),
239                new_signature: new_func.signature.clone(),
240                importance: Some(ChangeImportance::Medium),
241            });
242            file_modified = true;
243        } else if old_path == new_path
244            && old_func.content == new_func.content
245            && old_func.start_line != new_func.start_line
246            && moved_function_names.contains(name)
247        {
248            changes.push(SemanticChange::FunctionMoved {
249                file: new_path.to_path_buf(),
250                name: name.clone(),
251                old_start_line: old_func.start_line,
252                new_start_line: new_func.start_line,
253                importance: Some(ChangeImportance::Low),
254            });
255            file_modified = true;
256        } else if old_func.content != new_func.content {
257            changes.push(SemanticChange::FunctionModified {
258                file: new_path.to_path_buf(),
259                name: name.clone(),
260                importance: Some(ChangeImportance::Medium),
261            });
262            file_modified = true;
263        }
264    }
265
266    if file_modified {
267        changes.push(SemanticChange::FileModified {
268            path: new_path.to_path_buf(),
269            classification: None,
270            importance: None,
271            confidence: None,
272        });
273    }
274
275    changes
276}
277
278fn extraction_source(old_funcs: &FunctionMap, extracted: &FunctionDef) -> Option<String> {
279    let extracted_lines = meaningful_body_lines(&extracted.content);
280    if extracted_lines.is_empty() {
281        return None;
282    }
283
284    old_funcs
285        .iter()
286        .flat_map(|(name, funcs)| funcs.iter().map(move |func| (name.clone(), func)))
287        .filter_map(|(name, old_func)| {
288            let old_lines = meaningful_body_lines(&old_func.content);
289            let evidence = extraction_evidence(&old_lines, &extracted_lines);
290            evidence.is_strong().then_some((name, evidence))
291        })
292        .max_by(|left, right| {
293            left.1
294                .score
295                .total_cmp(&right.1.score)
296                .then_with(|| left.1.matched.cmp(&right.1.matched))
297                .then_with(|| right.0.cmp(&left.0))
298        })
299        .map(|(name, _)| name)
300}
301
302#[derive(Debug)]
303struct ExtractionEvidence {
304    matched: usize,
305    score: f64,
306    exact_matches: usize,
307    longest_exact_expression_len: usize,
308    extracted_lines: usize,
309}
310
311impl ExtractionEvidence {
312    fn is_strong(&self) -> bool {
313        if self.extracted_lines == 0 {
314            return false;
315        }
316
317        let coverage = self.matched as f64 / self.extracted_lines as f64;
318        let weighted_coverage = self.score / self.extracted_lines as f64;
319
320        if self.extracted_lines == 1 {
321            return self.exact_matches == 1
322                && weighted_coverage >= 0.95
323                && self.longest_exact_expression_len >= 20;
324        }
325
326        coverage >= 0.70 && weighted_coverage >= 0.70
327    }
328}
329
330fn extraction_evidence(old_lines: &[String], extracted_lines: &[String]) -> ExtractionEvidence {
331    let mut matched = 0;
332    let mut score = 0.0;
333    let mut exact_matches = 0;
334    let mut longest_exact_expression_len = 0;
335
336    for line in extracted_lines {
337        let best = old_lines
338            .iter()
339            .map(|old_line| body_line_match(old_line, line))
340            .max_by(|left, right| left.score.total_cmp(&right.score))
341            .unwrap_or_default();
342        if best.score > 0.0 {
343            matched += 1;
344            score += best.score;
345        }
346        if best.score >= 1.0 {
347            exact_matches += 1;
348            longest_exact_expression_len = longest_exact_expression_len.max(best.expression_len);
349        }
350    }
351
352    ExtractionEvidence {
353        matched,
354        score,
355        exact_matches,
356        longest_exact_expression_len,
357        extracted_lines: extracted_lines.len(),
358    }
359}
360
361#[derive(Clone, Copy, Debug, Default)]
362struct BodyLineMatch {
363    score: f64,
364    expression_len: usize,
365}
366
367fn body_line_match(old_line: &str, extracted_line: &str) -> BodyLineMatch {
368    let old = comparable_body_expression(old_line);
369    let extracted = comparable_body_expression(extracted_line);
370    if old == extracted {
371        return BodyLineMatch {
372            score: 1.0,
373            expression_len: extracted.len(),
374        };
375    }
376    if extracted.len() >= 24 && old.contains(&extracted) {
377        return BodyLineMatch {
378            score: 0.75,
379            expression_len: extracted.len(),
380        };
381    }
382    if old.len() >= 24 && extracted.contains(&old) {
383        return BodyLineMatch {
384            score: 0.75,
385            expression_len: old.len(),
386        };
387    }
388    BodyLineMatch::default()
389}
390
391fn comparable_body_expression(line: &str) -> String {
392    let trimmed = line
393        .trim()
394        .trim_end_matches(';')
395        .trim_start_matches("return ")
396        .trim();
397    let expression = trimmed
398        .split_once('=')
399        .map(|(_, rhs)| rhs.trim())
400        .unwrap_or(trimmed);
401    expression.trim_end_matches(';').trim().to_string()
402}
403
404fn meaningful_body_lines(content: &str) -> Vec<String> {
405    content
406        .lines()
407        .map(str::trim)
408        .filter(|line| {
409            !line.is_empty()
410                && *line != "{"
411                && *line != "}"
412                && !line.starts_with("fn ")
413                && !line.starts_with("pub fn ")
414                && !line.starts_with("async fn ")
415                && !line.starts_with("pub async fn ")
416        })
417        .map(ToString::to_string)
418        .collect()
419}
420
421fn stable_order_moved_names(
422    old_funcs: &FunctionMap,
423    new_funcs: &FunctionMap,
424    pairs: &[(String, usize, usize)],
425) -> HashSet<String> {
426    let mut old_order: Vec<(usize, String)> = Vec::new();
427    let mut new_order: Vec<(usize, String)> = Vec::new();
428    for (name, oi, ni) in pairs {
429        let old_func = &old_funcs[name][*oi];
430        let new_func = &new_funcs[name][*ni];
431        if old_func.content == new_func.content {
432            old_order.push((old_func.start_line, name.clone()));
433            new_order.push((new_func.start_line, name.clone()));
434        }
435    }
436    old_order.sort_by(|left, right| left.0.cmp(&right.0).then_with(|| left.1.cmp(&right.1)));
437    new_order.sort_by(|left, right| left.0.cmp(&right.0).then_with(|| left.1.cmp(&right.1)));
438
439    let old_names: Vec<String> = old_order.into_iter().map(|(_, n)| n).collect();
440    let new_names: Vec<String> = new_order.into_iter().map(|(_, n)| n).collect();
441    if old_names == new_names {
442        return HashSet::new();
443    }
444    old_names
445        .into_iter()
446        .zip(new_names)
447        .filter_map(|(old_name, new_name)| (old_name != new_name).then_some([old_name, new_name]))
448        .flatten()
449        .collect()
450}
451
452fn normalized_function_for_matching(content: &str, name: &str) -> String {
453    content
454        .replace(name, "__function_name__")
455        .lines()
456        .map(str::trim)
457        .filter(|line| !line.is_empty())
458        .collect::<Vec<_>>()
459        .join("\n")
460}