similarity-core 0.5.2

Core library for code similarity detection using AST-based comparison
Documentation
use crate::{
    function_extractor::{extract_functions, FunctionDefinition},
    parser::parse_and_convert_to_tree,
    subtree_fingerprint::{
        detect_partial_overlaps, generate_subtree_fingerprints, IndexedFunction, OverlapOptions,
        PartialOverlap,
    },
    tsed::{calculate_tsed, TSEDOptions},
};
use std::collections::HashMap;

/// Detect overlapping code fragments between functions
pub fn find_function_overlaps(
    source_code: &str,
    target_code: &str,
    options: &OverlapOptions,
) -> Result<Vec<PartialOverlap>, anyhow::Error> {
    // Extract functions from both files
    let source_functions = match extract_functions("source.ts", source_code) {
        Ok(funcs) => funcs,
        Err(e) if e.contains("Parse errors:") => {
            // Skip files with parse errors silently
            return Ok(Vec::new());
        }
        Err(e) => return Err(anyhow::anyhow!(e)),
    };

    let target_functions = match extract_functions("target.ts", target_code) {
        Ok(funcs) => funcs,
        Err(e) if e.contains("Parse errors:") => {
            // Skip files with parse errors silently
            return Ok(Vec::new());
        }
        Err(e) => return Err(anyhow::anyhow!(e)),
    };

    // Parse and index functions
    let mut all_overlaps = Vec::new();

    for source_func in &source_functions {
        let source_indexed = index_function(source_func, source_code, "source.ts")?;

        for target_func in &target_functions {
            // Skip if comparing the same function in the same file
            // (but allow comparing functions with same name in different files)
            if source_func.name == target_func.name && source_code == target_code {
                continue;
            }

            let target_indexed = index_function(target_func, target_code, "target.ts")?;

            // Debug output
            #[cfg(test)]
            {
                eprintln!("Comparing {} vs {}", source_func.name, target_func.name);
                eprintln!("Source subtrees: {}", source_indexed.subtree_index.len());
                eprintln!("Target subtrees: {}", target_indexed.subtree_index.len());
            }

            // Detect overlaps
            let overlaps = detect_partial_overlaps(&source_indexed, &target_indexed, options);
            all_overlaps.extend(overlaps);
        }
    }

    Ok(all_overlaps)
}

/// Detect overlaps across multiple files
pub fn find_overlaps_across_files(
    file_contents: &HashMap<String, String>,
    options: &OverlapOptions,
) -> Result<Vec<PartialOverlapWithFiles>, anyhow::Error> {
    let mut all_overlaps = Vec::new();
    let files: Vec<_> = file_contents.keys().collect();

    // Compare each pair of files (including same file)
    for i in 0..files.len() {
        for j in i..files.len() {
            // Start from i to include same-file comparisons
            let source_file = files[i];
            let target_file = files[j];
            let source_code = &file_contents[source_file];
            let target_code = &file_contents[target_file];

            // Find overlaps between these files
            let overlaps = find_function_overlaps(source_code, target_code, options)?;

            // Add file information to overlaps
            for overlap in overlaps {
                all_overlaps.push(PartialOverlapWithFiles {
                    source_file: source_file.clone(),
                    target_file: target_file.clone(),
                    overlap,
                });
            }
        }
    }

    Ok(all_overlaps)
}

/// Overlap result with file information
#[derive(Debug, Clone)]
pub struct PartialOverlapWithFiles {
    pub source_file: String,
    pub target_file: String,
    pub overlap: PartialOverlap,
}

/// Index a function for overlap detection
fn index_function(
    func: &FunctionDefinition,
    full_code: &str,
    file_name: &str,
) -> Result<IndexedFunction, anyhow::Error> {
    // Extract the entire function to parse it properly
    // We need to find the function boundaries more accurately
    let lines: Vec<&str> = full_code.lines().collect();
    let start_line = (func.start_line as usize).saturating_sub(1);
    let end_line = func.end_line as usize;

    if start_line >= lines.len() || end_line > lines.len() {
        return Err(anyhow::anyhow!("Function line numbers out of bounds"));
    }

    let func_code = lines[start_line..end_line].join("\n");

    #[cfg(test)]
    {
        eprintln!("Indexing function {}", func.name);
        eprintln!("Lines: {} - {}", func.start_line, func.end_line);
        eprintln!("Code length: {}", func_code.len());
        eprintln!("First 100 chars: {}", &func_code.chars().take(100).collect::<String>());
    }

    // Parse the function
    let tree = parse_and_convert_to_tree(file_name, &func_code).map_err(|e| anyhow::anyhow!(e))?;

    // Generate fingerprints for all subtrees
    let (root_fp, subtrees) = generate_subtree_fingerprints(&tree, 0, func.start_line);

    // Create indexed function
    let mut indexed = IndexedFunction::new(func.name.clone(), file_name.to_string(), root_fp);

    // Add all subtrees to the index
    for subtree in subtrees {
        indexed.add_subtree(subtree);
    }

    #[cfg(test)]
    eprintln!("Indexed {} subtrees for function {}", indexed.subtree_index.len(), func.name);

    Ok(indexed)
}

/// Find overlaps with detailed similarity calculation
pub fn find_overlaps_with_similarity(
    source_code: &str,
    target_code: &str,
    options: &OverlapOptions,
    tsed_options: &TSEDOptions,
) -> Result<Vec<DetailedOverlap>, anyhow::Error> {
    let overlaps = find_function_overlaps(source_code, target_code, options)?;
    let mut detailed_overlaps = Vec::new();

    for overlap in overlaps {
        // For high-similarity overlaps, calculate exact TSED similarity
        if overlap.similarity > 0.9 {
            // Extract the overlapping code segments
            let source_segment =
                extract_code_segment(source_code, overlap.source_lines.0, overlap.source_lines.1)?;
            let target_segment =
                extract_code_segment(target_code, overlap.target_lines.0, overlap.target_lines.1)?;

            // Parse and calculate exact similarity
            let source_tree = parse_and_convert_to_tree("source.ts", &source_segment)
                .map_err(|e| anyhow::anyhow!(e))?;
            let target_tree = parse_and_convert_to_tree("target.ts", &target_segment)
                .map_err(|e| anyhow::anyhow!(e))?;
            let exact_similarity = calculate_tsed(&source_tree, &target_tree, tsed_options);

            detailed_overlaps.push(DetailedOverlap {
                overlap: overlap.clone(),
                exact_similarity,
                source_code: source_segment,
                target_code: target_segment,
            });
        } else {
            detailed_overlaps.push(DetailedOverlap {
                overlap: overlap.clone(),
                exact_similarity: overlap.similarity,
                source_code: String::new(),
                target_code: String::new(),
            });
        }
    }

    Ok(detailed_overlaps)
}

/// Detailed overlap with exact similarity and code snippets
#[derive(Debug, Clone)]
pub struct DetailedOverlap {
    pub overlap: PartialOverlap,
    pub exact_similarity: f64,
    pub source_code: String,
    pub target_code: String,
}

/// Extract code segment by line numbers
fn extract_code_segment(
    code: &str,
    start_line: u32,
    end_line: u32,
) -> Result<String, anyhow::Error> {
    let lines: Vec<_> = code.lines().collect();

    if start_line as usize > lines.len() || end_line as usize > lines.len() {
        return Err(anyhow::anyhow!("Line numbers out of bounds"));
    }

    let start = (start_line as usize).saturating_sub(1);
    let end = (end_line as usize).min(lines.len());

    Ok(lines[start..end].join("\n"))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_find_function_overlaps() {
        let source_code = r#"
function processData(items) {
    const results = [];
    for (let i = 0; i < items.length; i++) {
        if (items[i].value > 10) {
            results.push(items[i].value * 2);
        }
    }
    return results;
}

function helperFunction() {
    const data = [];
    for (let i = 0; i < 10; i++) {
        data.push(i * 2);
    }
    return data;
}
"#;

        let target_code = r#"
function transformData(elements) {
    const output = [];
    // Similar loop structure
    for (let j = 0; j < elements.length; j++) {
        if (elements[j].val > 10) {
            output.push(elements[j].val * 2);
        }
    }
    return output;
}

function utilityFunction() {
    const numbers = [];
    // Exact same loop as helperFunction
    for (let i = 0; i < 10; i++) {
        numbers.push(i * 2);
    }
    return numbers;
}
"#;

        let options = OverlapOptions {
            min_window_size: 3,
            max_window_size: 20,
            threshold: 0.5,      // Lower threshold
            size_tolerance: 0.5, // Higher tolerance
        };

        let overlaps = find_function_overlaps(source_code, target_code, &options).unwrap();

        // Debug: print overlap count
        eprintln!("Found {} overlaps", overlaps.len());
        for (i, overlap) in overlaps.iter().enumerate() {
            eprintln!(
                "Overlap {}: {} ({} nodes, similarity: {})",
                i, overlap.node_type, overlap.node_count, overlap.similarity
            );
        }

        // Should find overlaps between similar functions
        assert!(!overlaps.is_empty());

        // Check that we found overlaps (may not always detect For specifically due to windowing)
    }

    #[test]
    fn test_extract_code_segment() {
        let code = "line1\nline2\nline3\nline4\nline5";

        let segment = extract_code_segment(code, 2, 4).unwrap();
        assert_eq!(segment, "line2\nline3\nline4");

        let segment = extract_code_segment(code, 1, 5).unwrap();
        assert_eq!(segment, "line1\nline2\nline3\nline4\nline5");
    }
}