Skip to main content

similarity_core/
overlap_detector.rs

1use crate::{
2    function_extractor::{extract_functions, FunctionDefinition},
3    parser::parse_and_convert_to_tree,
4    subtree_fingerprint::{
5        detect_partial_overlaps, generate_subtree_fingerprints, IndexedFunction, OverlapOptions,
6        PartialOverlap,
7    },
8    tsed::{calculate_tsed, TSEDOptions},
9};
10use std::collections::HashMap;
11
12/// Detect overlapping code fragments between functions
13pub fn find_function_overlaps(
14    source_code: &str,
15    target_code: &str,
16    options: &OverlapOptions,
17) -> Result<Vec<PartialOverlap>, anyhow::Error> {
18    // Extract functions from both files
19    let source_functions = match extract_functions("source.ts", source_code) {
20        Ok(funcs) => funcs,
21        Err(e) if e.contains("Parse errors:") => {
22            // Skip files with parse errors silently
23            return Ok(Vec::new());
24        }
25        Err(e) => return Err(anyhow::anyhow!(e)),
26    };
27
28    let target_functions = match extract_functions("target.ts", target_code) {
29        Ok(funcs) => funcs,
30        Err(e) if e.contains("Parse errors:") => {
31            // Skip files with parse errors silently
32            return Ok(Vec::new());
33        }
34        Err(e) => return Err(anyhow::anyhow!(e)),
35    };
36
37    // Parse and index functions
38    let mut all_overlaps = Vec::new();
39
40    for source_func in &source_functions {
41        let source_indexed = index_function(source_func, source_code, "source.ts")?;
42
43        for target_func in &target_functions {
44            // Skip if comparing the same function in the same file
45            // (but allow comparing functions with same name in different files)
46            if source_func.name == target_func.name && source_code == target_code {
47                continue;
48            }
49
50            let target_indexed = index_function(target_func, target_code, "target.ts")?;
51
52            // Debug output
53            #[cfg(test)]
54            {
55                eprintln!("Comparing {} vs {}", source_func.name, target_func.name);
56                eprintln!("Source subtrees: {}", source_indexed.subtree_index.len());
57                eprintln!("Target subtrees: {}", target_indexed.subtree_index.len());
58            }
59
60            // Detect overlaps
61            let overlaps = detect_partial_overlaps(&source_indexed, &target_indexed, options);
62            all_overlaps.extend(overlaps);
63        }
64    }
65
66    Ok(all_overlaps)
67}
68
69/// Detect overlaps across multiple files
70pub fn find_overlaps_across_files(
71    file_contents: &HashMap<String, String>,
72    options: &OverlapOptions,
73) -> Result<Vec<PartialOverlapWithFiles>, anyhow::Error> {
74    let mut all_overlaps = Vec::new();
75    let files: Vec<_> = file_contents.keys().collect();
76
77    // Compare each pair of files (including same file)
78    for i in 0..files.len() {
79        for j in i..files.len() {
80            // Start from i to include same-file comparisons
81            let source_file = files[i];
82            let target_file = files[j];
83            let source_code = &file_contents[source_file];
84            let target_code = &file_contents[target_file];
85
86            // Find overlaps between these files
87            let overlaps = find_function_overlaps(source_code, target_code, options)?;
88
89            // Add file information to overlaps
90            for overlap in overlaps {
91                all_overlaps.push(PartialOverlapWithFiles {
92                    source_file: source_file.clone(),
93                    target_file: target_file.clone(),
94                    overlap,
95                });
96            }
97        }
98    }
99
100    Ok(all_overlaps)
101}
102
103/// Overlap result with file information
104#[derive(Debug, Clone)]
105pub struct PartialOverlapWithFiles {
106    pub source_file: String,
107    pub target_file: String,
108    pub overlap: PartialOverlap,
109}
110
111/// Index a function for overlap detection
112fn index_function(
113    func: &FunctionDefinition,
114    full_code: &str,
115    file_name: &str,
116) -> Result<IndexedFunction, anyhow::Error> {
117    // Extract the entire function to parse it properly
118    // We need to find the function boundaries more accurately
119    let lines: Vec<&str> = full_code.lines().collect();
120    let start_line = (func.start_line as usize).saturating_sub(1);
121    let end_line = func.end_line as usize;
122
123    if start_line >= lines.len() || end_line > lines.len() {
124        return Err(anyhow::anyhow!("Function line numbers out of bounds"));
125    }
126
127    let func_code = lines[start_line..end_line].join("\n");
128
129    #[cfg(test)]
130    {
131        eprintln!("Indexing function {}", func.name);
132        eprintln!("Lines: {} - {}", func.start_line, func.end_line);
133        eprintln!("Code length: {}", func_code.len());
134        eprintln!("First 100 chars: {}", &func_code.chars().take(100).collect::<String>());
135    }
136
137    // Parse the function
138    let tree = parse_and_convert_to_tree(file_name, &func_code).map_err(|e| anyhow::anyhow!(e))?;
139
140    // Generate fingerprints for all subtrees
141    let (root_fp, subtrees) = generate_subtree_fingerprints(&tree, 0, func.start_line);
142
143    // Create indexed function
144    let mut indexed = IndexedFunction::new(func.name.clone(), file_name.to_string(), root_fp);
145
146    // Add all subtrees to the index
147    for subtree in subtrees {
148        indexed.add_subtree(subtree);
149    }
150
151    #[cfg(test)]
152    eprintln!("Indexed {} subtrees for function {}", indexed.subtree_index.len(), func.name);
153
154    Ok(indexed)
155}
156
157/// Find overlaps with detailed similarity calculation
158pub fn find_overlaps_with_similarity(
159    source_code: &str,
160    target_code: &str,
161    options: &OverlapOptions,
162    tsed_options: &TSEDOptions,
163) -> Result<Vec<DetailedOverlap>, anyhow::Error> {
164    let overlaps = find_function_overlaps(source_code, target_code, options)?;
165    let mut detailed_overlaps = Vec::new();
166
167    for overlap in overlaps {
168        // For high-similarity overlaps, calculate exact TSED similarity
169        if overlap.similarity > 0.9 {
170            // Extract the overlapping code segments
171            let source_segment =
172                extract_code_segment(source_code, overlap.source_lines.0, overlap.source_lines.1)?;
173            let target_segment =
174                extract_code_segment(target_code, overlap.target_lines.0, overlap.target_lines.1)?;
175
176            // Parse and calculate exact similarity
177            let source_tree = parse_and_convert_to_tree("source.ts", &source_segment)
178                .map_err(|e| anyhow::anyhow!(e))?;
179            let target_tree = parse_and_convert_to_tree("target.ts", &target_segment)
180                .map_err(|e| anyhow::anyhow!(e))?;
181            let exact_similarity = calculate_tsed(&source_tree, &target_tree, tsed_options);
182
183            detailed_overlaps.push(DetailedOverlap {
184                overlap: overlap.clone(),
185                exact_similarity,
186                source_code: source_segment,
187                target_code: target_segment,
188            });
189        } else {
190            detailed_overlaps.push(DetailedOverlap {
191                overlap: overlap.clone(),
192                exact_similarity: overlap.similarity,
193                source_code: String::new(),
194                target_code: String::new(),
195            });
196        }
197    }
198
199    Ok(detailed_overlaps)
200}
201
202/// Detailed overlap with exact similarity and code snippets
203#[derive(Debug, Clone)]
204pub struct DetailedOverlap {
205    pub overlap: PartialOverlap,
206    pub exact_similarity: f64,
207    pub source_code: String,
208    pub target_code: String,
209}
210
211/// Extract code segment by line numbers
212fn extract_code_segment(
213    code: &str,
214    start_line: u32,
215    end_line: u32,
216) -> Result<String, anyhow::Error> {
217    let lines: Vec<_> = code.lines().collect();
218
219    if start_line as usize > lines.len() || end_line as usize > lines.len() {
220        return Err(anyhow::anyhow!("Line numbers out of bounds"));
221    }
222
223    let start = (start_line as usize).saturating_sub(1);
224    let end = (end_line as usize).min(lines.len());
225
226    Ok(lines[start..end].join("\n"))
227}
228
229#[cfg(test)]
230mod tests {
231    use super::*;
232
233    #[test]
234    fn test_find_function_overlaps() {
235        let source_code = r#"
236function processData(items) {
237    const results = [];
238    for (let i = 0; i < items.length; i++) {
239        if (items[i].value > 10) {
240            results.push(items[i].value * 2);
241        }
242    }
243    return results;
244}
245
246function helperFunction() {
247    const data = [];
248    for (let i = 0; i < 10; i++) {
249        data.push(i * 2);
250    }
251    return data;
252}
253"#;
254
255        let target_code = r#"
256function transformData(elements) {
257    const output = [];
258    // Similar loop structure
259    for (let j = 0; j < elements.length; j++) {
260        if (elements[j].val > 10) {
261            output.push(elements[j].val * 2);
262        }
263    }
264    return output;
265}
266
267function utilityFunction() {
268    const numbers = [];
269    // Exact same loop as helperFunction
270    for (let i = 0; i < 10; i++) {
271        numbers.push(i * 2);
272    }
273    return numbers;
274}
275"#;
276
277        let options = OverlapOptions {
278            min_window_size: 3,
279            max_window_size: 20,
280            threshold: 0.5,      // Lower threshold
281            size_tolerance: 0.5, // Higher tolerance
282        };
283
284        let overlaps = find_function_overlaps(source_code, target_code, &options).unwrap();
285
286        // Debug: print overlap count
287        eprintln!("Found {} overlaps", overlaps.len());
288        for (i, overlap) in overlaps.iter().enumerate() {
289            eprintln!(
290                "Overlap {}: {} ({} nodes, similarity: {})",
291                i, overlap.node_type, overlap.node_count, overlap.similarity
292            );
293        }
294
295        // Should find overlaps between similar functions
296        assert!(!overlaps.is_empty());
297
298        // Check that we found overlaps (may not always detect For specifically due to windowing)
299    }
300
301    #[test]
302    fn test_extract_code_segment() {
303        let code = "line1\nline2\nline3\nline4\nline5";
304
305        let segment = extract_code_segment(code, 2, 4).unwrap();
306        assert_eq!(segment, "line2\nline3\nline4");
307
308        let segment = extract_code_segment(code, 1, 5).unwrap();
309        assert_eq!(segment, "line1\nline2\nline3\nline4\nline5");
310    }
311}