1use crate::{
2 function_extractor::{extract_functions, FunctionDefinition},
3 parser::parse_and_convert_to_tree,
4 subtree_fingerprint::{
5 detect_partial_overlaps, generate_subtree_fingerprints, IndexedFunction, OverlapOptions,
6 PartialOverlap,
7 },
8 tsed::{calculate_tsed, TSEDOptions},
9};
10use std::collections::HashMap;
11
12pub fn find_function_overlaps(
14 source_code: &str,
15 target_code: &str,
16 options: &OverlapOptions,
17) -> Result<Vec<PartialOverlap>, anyhow::Error> {
18 let source_functions = match extract_functions("source.ts", source_code) {
20 Ok(funcs) => funcs,
21 Err(e) if e.contains("Parse errors:") => {
22 return Ok(Vec::new());
24 }
25 Err(e) => return Err(anyhow::anyhow!(e)),
26 };
27
28 let target_functions = match extract_functions("target.ts", target_code) {
29 Ok(funcs) => funcs,
30 Err(e) if e.contains("Parse errors:") => {
31 return Ok(Vec::new());
33 }
34 Err(e) => return Err(anyhow::anyhow!(e)),
35 };
36
37 let mut all_overlaps = Vec::new();
39
40 for source_func in &source_functions {
41 let source_indexed = index_function(source_func, source_code, "source.ts")?;
42
43 for target_func in &target_functions {
44 if source_func.name == target_func.name && source_code == target_code {
47 continue;
48 }
49
50 let target_indexed = index_function(target_func, target_code, "target.ts")?;
51
52 #[cfg(test)]
54 {
55 eprintln!("Comparing {} vs {}", source_func.name, target_func.name);
56 eprintln!("Source subtrees: {}", source_indexed.subtree_index.len());
57 eprintln!("Target subtrees: {}", target_indexed.subtree_index.len());
58 }
59
60 let overlaps = detect_partial_overlaps(&source_indexed, &target_indexed, options);
62 all_overlaps.extend(overlaps);
63 }
64 }
65
66 Ok(all_overlaps)
67}
68
69pub fn find_overlaps_across_files(
71 file_contents: &HashMap<String, String>,
72 options: &OverlapOptions,
73) -> Result<Vec<PartialOverlapWithFiles>, anyhow::Error> {
74 let mut all_overlaps = Vec::new();
75 let files: Vec<_> = file_contents.keys().collect();
76
77 for i in 0..files.len() {
79 for j in i..files.len() {
80 let source_file = files[i];
82 let target_file = files[j];
83 let source_code = &file_contents[source_file];
84 let target_code = &file_contents[target_file];
85
86 let overlaps = find_function_overlaps(source_code, target_code, options)?;
88
89 for overlap in overlaps {
91 all_overlaps.push(PartialOverlapWithFiles {
92 source_file: source_file.clone(),
93 target_file: target_file.clone(),
94 overlap,
95 });
96 }
97 }
98 }
99
100 Ok(all_overlaps)
101}
102
103#[derive(Debug, Clone)]
105pub struct PartialOverlapWithFiles {
106 pub source_file: String,
107 pub target_file: String,
108 pub overlap: PartialOverlap,
109}
110
111fn index_function(
113 func: &FunctionDefinition,
114 full_code: &str,
115 file_name: &str,
116) -> Result<IndexedFunction, anyhow::Error> {
117 let lines: Vec<&str> = full_code.lines().collect();
120 let start_line = (func.start_line as usize).saturating_sub(1);
121 let end_line = func.end_line as usize;
122
123 if start_line >= lines.len() || end_line > lines.len() {
124 return Err(anyhow::anyhow!("Function line numbers out of bounds"));
125 }
126
127 let func_code = lines[start_line..end_line].join("\n");
128
129 #[cfg(test)]
130 {
131 eprintln!("Indexing function {}", func.name);
132 eprintln!("Lines: {} - {}", func.start_line, func.end_line);
133 eprintln!("Code length: {}", func_code.len());
134 eprintln!("First 100 chars: {}", &func_code.chars().take(100).collect::<String>());
135 }
136
137 let tree = parse_and_convert_to_tree(file_name, &func_code).map_err(|e| anyhow::anyhow!(e))?;
139
140 let (root_fp, subtrees) = generate_subtree_fingerprints(&tree, 0, func.start_line);
142
143 let mut indexed = IndexedFunction::new(func.name.clone(), file_name.to_string(), root_fp);
145
146 for subtree in subtrees {
148 indexed.add_subtree(subtree);
149 }
150
151 #[cfg(test)]
152 eprintln!("Indexed {} subtrees for function {}", indexed.subtree_index.len(), func.name);
153
154 Ok(indexed)
155}
156
157pub fn find_overlaps_with_similarity(
159 source_code: &str,
160 target_code: &str,
161 options: &OverlapOptions,
162 tsed_options: &TSEDOptions,
163) -> Result<Vec<DetailedOverlap>, anyhow::Error> {
164 let overlaps = find_function_overlaps(source_code, target_code, options)?;
165 let mut detailed_overlaps = Vec::new();
166
167 for overlap in overlaps {
168 if overlap.similarity > 0.9 {
170 let source_segment =
172 extract_code_segment(source_code, overlap.source_lines.0, overlap.source_lines.1)?;
173 let target_segment =
174 extract_code_segment(target_code, overlap.target_lines.0, overlap.target_lines.1)?;
175
176 let source_tree = parse_and_convert_to_tree("source.ts", &source_segment)
178 .map_err(|e| anyhow::anyhow!(e))?;
179 let target_tree = parse_and_convert_to_tree("target.ts", &target_segment)
180 .map_err(|e| anyhow::anyhow!(e))?;
181 let exact_similarity = calculate_tsed(&source_tree, &target_tree, tsed_options);
182
183 detailed_overlaps.push(DetailedOverlap {
184 overlap: overlap.clone(),
185 exact_similarity,
186 source_code: source_segment,
187 target_code: target_segment,
188 });
189 } else {
190 detailed_overlaps.push(DetailedOverlap {
191 overlap: overlap.clone(),
192 exact_similarity: overlap.similarity,
193 source_code: String::new(),
194 target_code: String::new(),
195 });
196 }
197 }
198
199 Ok(detailed_overlaps)
200}
201
202#[derive(Debug, Clone)]
204pub struct DetailedOverlap {
205 pub overlap: PartialOverlap,
206 pub exact_similarity: f64,
207 pub source_code: String,
208 pub target_code: String,
209}
210
211fn extract_code_segment(
213 code: &str,
214 start_line: u32,
215 end_line: u32,
216) -> Result<String, anyhow::Error> {
217 let lines: Vec<_> = code.lines().collect();
218
219 if start_line as usize > lines.len() || end_line as usize > lines.len() {
220 return Err(anyhow::anyhow!("Line numbers out of bounds"));
221 }
222
223 let start = (start_line as usize).saturating_sub(1);
224 let end = (end_line as usize).min(lines.len());
225
226 Ok(lines[start..end].join("\n"))
227}
228
229#[cfg(test)]
230mod tests {
231 use super::*;
232
233 #[test]
234 fn test_find_function_overlaps() {
235 let source_code = r#"
236function processData(items) {
237 const results = [];
238 for (let i = 0; i < items.length; i++) {
239 if (items[i].value > 10) {
240 results.push(items[i].value * 2);
241 }
242 }
243 return results;
244}
245
246function helperFunction() {
247 const data = [];
248 for (let i = 0; i < 10; i++) {
249 data.push(i * 2);
250 }
251 return data;
252}
253"#;
254
255 let target_code = r#"
256function transformData(elements) {
257 const output = [];
258 // Similar loop structure
259 for (let j = 0; j < elements.length; j++) {
260 if (elements[j].val > 10) {
261 output.push(elements[j].val * 2);
262 }
263 }
264 return output;
265}
266
267function utilityFunction() {
268 const numbers = [];
269 // Exact same loop as helperFunction
270 for (let i = 0; i < 10; i++) {
271 numbers.push(i * 2);
272 }
273 return numbers;
274}
275"#;
276
277 let options = OverlapOptions {
278 min_window_size: 3,
279 max_window_size: 20,
280 threshold: 0.5, size_tolerance: 0.5, };
283
284 let overlaps = find_function_overlaps(source_code, target_code, &options).unwrap();
285
286 eprintln!("Found {} overlaps", overlaps.len());
288 for (i, overlap) in overlaps.iter().enumerate() {
289 eprintln!(
290 "Overlap {}: {} ({} nodes, similarity: {})",
291 i, overlap.node_type, overlap.node_count, overlap.similarity
292 );
293 }
294
295 assert!(!overlaps.is_empty());
297
298 }
300
301 #[test]
302 fn test_extract_code_segment() {
303 let code = "line1\nline2\nline3\nline4\nline5";
304
305 let segment = extract_code_segment(code, 2, 4).unwrap();
306 assert_eq!(segment, "line2\nline3\nline4");
307
308 let segment = extract_code_segment(code, 1, 5).unwrap();
309 assert_eq!(segment, "line1\nline2\nline3\nline4\nline5");
310 }
311}