similarity-mbt 0.5.2

CLI tool for detecting code duplication in MoonBit projects
Documentation
#![allow(clippy::uninlined_format_args)]

use crate::parallel::check_within_file_duplicates_parallel;
use similarity_core::{
    cli_file_utils::collect_files,
    cli_output::{format_function_output, show_function_code},
    cli_parallel::SimilarityResult,
    language_parser::GenericFunctionDef,
    TSEDOptions,
};
use std::path::PathBuf;

/// Structure to hold all similarity results
struct DuplicateResult {
    file1: PathBuf,
    #[allow(dead_code)]
    file2: PathBuf,
    result: SimilarityResult<GenericFunctionDef>,
}

impl DuplicateResult {
    fn priority(&self) -> f64 {
        let avg_lines = ((self.result.func1.end_line - self.result.func1.start_line + 1)
            + (self.result.func2.end_line - self.result.func2.start_line + 1))
            as f64
            / 2.0;
        self.result.similarity * avg_lines
    }
}

#[allow(clippy::too_many_arguments)]
pub fn check_paths(
    paths: Vec<String>,
    threshold: f64,
    rename_cost: f64,
    extensions: Option<&Vec<String>>,
    min_lines: u32,
    min_tokens: Option<u32>,
    no_size_penalty: bool,
    print: bool,
    _fast_mode: bool,
    filter_function: Option<&String>,
    filter_function_body: Option<&String>,
) -> anyhow::Result<usize> {
    let default_extensions = vec!["mbt"];
    let exts: Vec<&str> =
        extensions.map_or(default_extensions, |v| v.iter().map(String::as_str).collect());

    let files = collect_files(&paths, &exts)?;

    if files.is_empty() {
        println!("No MoonBit files found in the specified paths.");
        return Ok(0);
    }

    println!("Checking {} files for duplicates...", files.len());

    let mut options = TSEDOptions::default();
    options.apted_options.rename_cost = rename_cost;
    options.min_lines = min_lines;
    options.min_tokens = min_tokens;
    options.size_penalty = !no_size_penalty;

    let mut all_results = Vec::new();

    let within_file_results = check_within_file_duplicates_parallel(&files, threshold, &options);

    for (file, similar_pairs) in within_file_results {
        for result in similar_pairs {
            all_results.push(DuplicateResult { file1: file.clone(), file2: file.clone(), result });
        }
    }

    let duplicate_count =
        display_all_results(all_results, print, filter_function, filter_function_body);

    Ok(duplicate_count)
}

/// Display similarity results
fn display_all_results(
    mut all_results: Vec<DuplicateResult>,
    print: bool,
    filter_function: Option<&String>,
    filter_function_body: Option<&String>,
) -> usize {
    if all_results.is_empty() {
        println!("\nNo duplicate functions found!");
        return 0;
    }

    if let Some(filter) = filter_function {
        all_results.retain(|dup| {
            dup.result.func1.name.contains(filter) || dup.result.func2.name.contains(filter)
        });
    }

    if let Some(body_filter) = filter_function_body {
        all_results.retain(|dup| {
            let file_path = dup.file1.to_string_lossy();
            if let Ok(content) = std::fs::read_to_string(file_path.as_ref()) {
                let lines: Vec<&str> = content.lines().collect();
                let body1 = extract_body_lines(&lines, &dup.result.func1);
                let body2 = extract_body_lines(&lines, &dup.result.func2);
                body1.contains(body_filter) || body2.contains(body_filter)
            } else {
                false
            }
        });
    }

    all_results.sort_by(|a, b| {
        b.priority().partial_cmp(&a.priority()).unwrap_or(std::cmp::Ordering::Equal)
    });

    let mut file_groups = std::collections::HashMap::new();
    for dup in all_results {
        let file_path = dup.file1.to_string_lossy().to_string();
        file_groups.entry(file_path).or_insert_with(Vec::new).push(dup);
    }

    let mut total_count = 0;
    for (file_path, duplicates) in file_groups {
        println!("\nDuplicates in {}:", file_path);
        println!("{}", "-".repeat(60));

        for dup in &duplicates {
            let func1 = &dup.result.func1;
            let func2 = &dup.result.func2;

            println!(
                "  {} <-> {}",
                format_function_output(
                    &file_path,
                    &format!(
                        "{} {}",
                        if func1.is_method { "method" } else { "function" },
                        &func1.name
                    ),
                    func1.start_line,
                    func1.end_line
                ),
                format_function_output(
                    &file_path,
                    &format!(
                        "{} {}",
                        if func2.is_method { "method" } else { "function" },
                        &func2.name
                    ),
                    func2.start_line,
                    func2.end_line
                )
            );
            println!("  Similarity: {:.2}%", dup.result.similarity * 100.0);

            if let (Some(class1), Some(class2)) = (&func1.class_name, &func2.class_name) {
                println!("  Types: {} <-> {}", class1, class2);
            }

            if print {
                show_function_code(&file_path, &func1.name, func1.start_line, func1.end_line);
                show_function_code(&file_path, &func2.name, func2.start_line, func2.end_line);
                println!();
            }

            total_count += 1;
        }
    }

    println!("\nTotal duplicate pairs found: {}", total_count);

    total_count
}

fn extract_body_lines(lines: &[&str], func: &GenericFunctionDef) -> String {
    let start = (func.body_start_line.saturating_sub(1)) as usize;
    let end = std::cmp::min(func.body_end_line as usize, lines.len());
    if start >= lines.len() {
        return String::new();
    }
    lines[start..end].join("\n")
}