use crate::{
function_extractor::{extract_functions, FunctionDefinition},
parser::parse_and_convert_to_tree,
subtree_fingerprint::{
detect_partial_overlaps, generate_subtree_fingerprints, IndexedFunction, OverlapOptions,
PartialOverlap,
},
tsed::{calculate_tsed, TSEDOptions},
};
use std::collections::HashMap;
/// Finds partial overlaps between the functions of two pieces of source code.
///
/// Functions are extracted from both inputs (parsed under the placeholder file
/// names `source.ts` and `target.ts`). Every source function is then compared
/// with every target function via [`detect_partial_overlaps`], and the reported
/// overlaps are concatenated in comparison order.
///
/// When `source_code` and `target_code` are identical, functions sharing the
/// same name are not compared with each other.
///
/// # Errors
///
/// Returns an error when function extraction fails with a message that does not
/// contain `"Parse errors:"`, or when a function cannot be indexed. An
/// extraction failure whose message contains `"Parse errors:"` is not an error:
/// it yields `Ok` with an empty list.
pub fn find_function_overlaps(
    source_code: &str,
    target_code: &str,
    options: &OverlapOptions,
) -> Result<Vec<PartialOverlap>, anyhow::Error> {
    let source_functions = match extract_functions("source.ts", source_code) {
        Ok(funcs) => funcs,
        // Unparseable input is treated as "nothing to compare".
        Err(e) if e.contains("Parse errors:") => {
            return Ok(Vec::new());
        }
        Err(e) => return Err(anyhow::anyhow!(e)),
    };
    let target_functions = match extract_functions("target.ts", target_code) {
        Ok(funcs) => funcs,
        Err(e) if e.contains("Parse errors:") => {
            return Ok(Vec::new());
        }
        Err(e) => return Err(anyhow::anyhow!(e)),
    };

    // Loop-invariant, so compare the two inputs once instead of per pair.
    let same_input = source_code == target_code;

    // One slot per target function. A target is parsed and fingerprinted on its
    // first use and then reused for every later source function, instead of
    // being re-indexed for each (source, target) pair. Filling the slots lazily
    // keeps the order in which indexing errors surface unchanged.
    let mut target_cache: Vec<Option<_>> = target_functions.iter().map(|_| None).collect();

    let mut all_overlaps = Vec::new();
    for source_func in &source_functions {
        let source_indexed = index_function(source_func, source_code, "source.ts")?;
        for (target_func, slot) in target_functions.iter().zip(target_cache.iter_mut()) {
            // Skip a function paired with itself when both inputs are the same text.
            if same_input && source_func.name == target_func.name {
                continue;
            }
            let indexed = match slot.take() {
                Some(indexed) => indexed,
                None => index_function(target_func, target_code, "target.ts")?,
            };
            let target_indexed = &*slot.insert(indexed);
            #[cfg(test)]
            {
                eprintln!("Comparing {} vs {}", source_func.name, target_func.name);
                eprintln!("Source subtrees: {}", source_indexed.subtree_index.len());
                eprintln!("Target subtrees: {}", target_indexed.subtree_index.len());
            }
            all_overlaps.extend(detect_partial_overlaps(&source_indexed, target_indexed, options));
        }
    }
    Ok(all_overlaps)
}
/// Finds partial overlaps between the functions of every pair of files.
///
/// `file_contents` maps a file name to its source text. Each unordered pair of
/// files is compared once with [`find_function_overlaps`], and each file is also
/// compared with itself. Every overlap is tagged with the names of the two files
/// it was found in.
///
/// Files are processed in ascending name order, so both the source/target role
/// of each file in a pair and the order of the returned overlaps are
/// reproducible across runs.
///
/// # Errors
///
/// Propagates the first error returned by [`find_function_overlaps`].
pub fn find_overlaps_across_files(
    file_contents: &HashMap<String, String>,
    options: &OverlapOptions,
) -> Result<Vec<PartialOverlapWithFiles>, anyhow::Error> {
    // `HashMap` iteration order is unspecified; sort by file name so the
    // pairing and the output order do not change from run to run.
    let mut files: Vec<(&String, &String)> = file_contents.iter().collect();
    files.sort_unstable_by(|a, b| a.0.cmp(b.0));

    let mut all_overlaps = Vec::new();
    for (i, &(source_file, source_code)) in files.iter().enumerate() {
        // Start at `i`: a file is compared with itself and with each later file.
        for &(target_file, target_code) in &files[i..] {
            let overlaps = find_function_overlaps(source_code, target_code, options)?;
            all_overlaps.extend(overlaps.into_iter().map(|overlap| PartialOverlapWithFiles {
                source_file: source_file.clone(),
                target_file: target_file.clone(),
                overlap,
            }));
        }
    }
    Ok(all_overlaps)
}
/// A [`PartialOverlap`] together with the names of the two files it was found in.
///
/// Produced by `find_overlaps_across_files`.
#[derive(Debug, Clone)]
pub struct PartialOverlapWithFiles {
/// Key (in the input map) of the file used as the source side of the comparison.
pub source_file: String,
/// Key (in the input map) of the file used as the target side of the comparison.
pub target_file: String,
/// The overlap reported for this pair of files.
pub overlap: PartialOverlap,
}
/// Builds the subtree index for a single function.
///
/// The function's lines (`func.start_line..=func.end_line`, 1-based) are cut out
/// of `full_code`, parsed on their own under `file_name`, and fingerprinted with
/// [`generate_subtree_fingerprints`]. Every resulting subtree is added to the
/// returned [`IndexedFunction`].
///
/// # Errors
///
/// Returns an error when the function's line range does not fit inside
/// `full_code` (including a start line that lies beyond the end line), or when
/// the extracted code cannot be parsed.
fn index_function(
    func: &FunctionDefinition,
    full_code: &str,
    file_name: &str,
) -> Result<IndexedFunction, anyhow::Error> {
    let lines: Vec<&str> = full_code.lines().collect();
    // Convert the 1-based inclusive range into a 0-based half-open one.
    let start_line = (func.start_line as usize).saturating_sub(1);
    let end_line = func.end_line as usize;
    // The checked slice rejects both an end past the last line and an inverted
    // range, which would otherwise panic when slicing.
    let func_code = match lines.get(start_line..end_line) {
        Some(body) if start_line < lines.len() => body.join("\n"),
        _ => return Err(anyhow::anyhow!("Function line numbers out of bounds")),
    };
    #[cfg(test)]
    {
        eprintln!("Indexing function {}", func.name);
        eprintln!("Lines: {} - {}", func.start_line, func.end_line);
        eprintln!("Code length: {}", func_code.len());
        eprintln!("First 100 chars: {}", &func_code.chars().take(100).collect::<String>());
    }
    let tree = parse_and_convert_to_tree(file_name, &func_code).map_err(|e| anyhow::anyhow!(e))?;
    // NOTE(review): the `0` and `func.start_line` arguments look like a starting
    // depth and a line offset back into the full file — confirm against
    // `generate_subtree_fingerprints`.
    let (root_fp, subtrees) = generate_subtree_fingerprints(&tree, 0, func.start_line);
    let mut indexed = IndexedFunction::new(func.name.clone(), file_name.to_string(), root_fp);
    for subtree in subtrees {
        indexed.add_subtree(subtree);
    }
    #[cfg(test)]
    eprintln!("Indexed {} subtrees for function {}", indexed.subtree_index.len(), func.name);
    Ok(indexed)
}
/// Finds partial overlaps and refines the score of the most similar ones.
///
/// Overlaps are first found with [`find_function_overlaps`]. Each overlap whose
/// similarity is above `0.9` has its source and target line ranges extracted,
/// re-parsed, and re-scored with [`calculate_tsed`]; the extracted code is stored
/// alongside the new score. Every other overlap is passed through with its
/// original similarity and empty code strings.
///
/// # Errors
///
/// Propagates errors from [`find_function_overlaps`], and returns an error when
/// an overlap's line range lies outside the corresponding input or when an
/// extracted segment cannot be parsed.
pub fn find_overlaps_with_similarity(
    source_code: &str,
    target_code: &str,
    options: &OverlapOptions,
    tsed_options: &TSEDOptions,
) -> Result<Vec<DetailedOverlap>, anyhow::Error> {
    /// Similarity above which an overlap is re-scored with TSED.
    const RESCORE_THRESHOLD: f64 = 0.9;

    let overlaps = find_function_overlaps(source_code, target_code, options)?;
    let mut detailed_overlaps = Vec::with_capacity(overlaps.len());
    for overlap in overlaps {
        if overlap.similarity > RESCORE_THRESHOLD {
            let source_segment =
                extract_code_segment(source_code, overlap.source_lines.0, overlap.source_lines.1)?;
            let target_segment =
                extract_code_segment(target_code, overlap.target_lines.0, overlap.target_lines.1)?;
            let source_tree = parse_and_convert_to_tree("source.ts", &source_segment)
                .map_err(|e| anyhow::anyhow!(e))?;
            let target_tree = parse_and_convert_to_tree("target.ts", &target_segment)
                .map_err(|e| anyhow::anyhow!(e))?;
            let exact_similarity = calculate_tsed(&source_tree, &target_tree, tsed_options);
            // `overlap` is owned by this iteration, so it is moved rather than cloned.
            detailed_overlaps.push(DetailedOverlap {
                overlap,
                exact_similarity,
                source_code: source_segment,
                target_code: target_segment,
            });
        } else {
            detailed_overlaps.push(DetailedOverlap {
                // Read the score before `overlap` is moved into the struct.
                exact_similarity: overlap.similarity,
                overlap,
                source_code: String::new(),
                target_code: String::new(),
            });
        }
    }
    Ok(detailed_overlaps)
}
/// A [`PartialOverlap`] enriched with a similarity score and the matched code.
///
/// Produced by `find_overlaps_with_similarity`.
#[derive(Debug, Clone)]
pub struct DetailedOverlap {
/// The overlap as originally detected.
pub overlap: PartialOverlap,
/// The TSED score of the two segments when the overlap was re-scored;
/// otherwise a copy of `overlap.similarity`.
pub exact_similarity: f64,
/// Source-side lines of the overlap; empty when the overlap was not re-scored.
pub source_code: String,
/// Target-side lines of the overlap; empty when the overlap was not re-scored.
pub target_code: String,
}
/// Returns lines `start_line..=end_line` (1-based) of `code`, joined with `\n`.
///
/// A `start_line` of `0` is treated like `1`.
///
/// # Errors
///
/// Returns an error when either line number is greater than the number of lines
/// in `code`, or when the start line lies beyond the end line.
fn extract_code_segment(
    code: &str,
    start_line: u32,
    end_line: u32,
) -> Result<String, anyhow::Error> {
    let lines: Vec<_> = code.lines().collect();
    // Convert the 1-based inclusive range into a 0-based half-open one.
    let start = (start_line as usize).saturating_sub(1);
    let end = end_line as usize;
    if start_line as usize > lines.len() {
        return Err(anyhow::anyhow!("Line numbers out of bounds"));
    }
    // The checked slice rejects an end past the last line as well as an
    // inverted range, which would otherwise panic when slicing.
    match lines.get(start..end) {
        Some(segment) => Ok(segment.join("\n")),
        None => Err(anyhow::anyhow!("Line numbers out of bounds")),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two inputs containing structurally similar functions must produce at
    /// least one overlap.
    #[test]
    fn test_find_function_overlaps() {
        // The fixture text is kept flush-left so the string contents stay exact.
        let left_input = r#"
function processData(items) {
const results = [];
for (let i = 0; i < items.length; i++) {
if (items[i].value > 10) {
results.push(items[i].value * 2);
}
}
return results;
}
function helperFunction() {
const data = [];
for (let i = 0; i < 10; i++) {
data.push(i * 2);
}
return data;
}
"#;
        let right_input = r#"
function transformData(elements) {
const output = [];
// Similar loop structure
for (let j = 0; j < elements.length; j++) {
if (elements[j].val > 10) {
output.push(elements[j].val * 2);
}
}
return output;
}
function utilityFunction() {
const numbers = [];
// Exact same loop as helperFunction
for (let i = 0; i < 10; i++) {
numbers.push(i * 2);
}
return numbers;
}
"#;
        let settings = OverlapOptions {
            min_window_size: 3,
            max_window_size: 20,
            threshold: 0.5,
            size_tolerance: 0.5,
        };

        let found = find_function_overlaps(left_input, right_input, &settings).unwrap();

        // Diagnostic output, shown only when the test fails or runs with --nocapture.
        eprintln!("Found {} overlaps", found.len());
        found.iter().enumerate().for_each(|(idx, item)| {
            eprintln!(
                "Overlap {}: {} ({} nodes, similarity: {})",
                idx, item.node_type, item.node_count, item.similarity
            );
        });

        assert!(!found.is_empty());
    }

    /// Line ranges are 1-based and inclusive at both ends.
    #[test]
    fn test_extract_code_segment() {
        let text = "line1\nline2\nline3\nline4\nline5";
        let cases = [
            ((2, 4), "line2\nline3\nline4"),
            ((1, 5), "line1\nline2\nline3\nline4\nline5"),
        ];
        for &((first, last), expected) in &cases {
            let segment = extract_code_segment(text, first, last).unwrap();
            assert_eq!(segment, expected);
        }
    }
}