#![allow(clippy::uninlined_format_args)]
use crate::parallel::check_within_file_duplicates_parallel;
use similarity_core::{
cli_file_utils::collect_files,
cli_output::{format_function_output, show_function_code},
cli_parallel::SimilarityResult,
language_parser::GenericFunctionDef,
TSEDOptions,
};
use std::path::PathBuf;
struct DuplicateResult {
file1: PathBuf,
#[allow(dead_code)]
file2: PathBuf,
result: SimilarityResult<GenericFunctionDef>,
}
impl DuplicateResult {
fn priority(&self) -> f64 {
let avg_lines = ((self.result.func1.end_line - self.result.func1.start_line + 1)
+ (self.result.func2.end_line - self.result.func2.start_line + 1))
as f64
/ 2.0;
self.result.similarity * avg_lines
}
}
#[allow(clippy::too_many_arguments)]
pub fn check_paths(
paths: Vec<String>,
threshold: f64,
rename_cost: f64,
extensions: Option<&Vec<String>>,
min_lines: u32,
min_tokens: Option<u32>,
no_size_penalty: bool,
print: bool,
_fast_mode: bool, filter_function: Option<&String>,
filter_function_body: Option<&String>,
) -> anyhow::Result<usize> {
let default_extensions = vec!["py"];
let exts: Vec<&str> =
extensions.map_or(default_extensions, |v| v.iter().map(String::as_str).collect());
let files = collect_files(&paths, &exts)?;
if files.is_empty() {
println!("No Python files found in the specified paths.");
return Ok(0);
}
println!("Checking {} files for duplicates...", files.len());
let mut options = TSEDOptions::default();
options.apted_options.rename_cost = rename_cost;
options.min_lines = min_lines;
options.min_tokens = min_tokens;
options.size_penalty = !no_size_penalty;
let mut all_results = Vec::new();
let within_file_results = check_within_file_duplicates_parallel(&files, threshold, &options);
for (file, similar_pairs) in within_file_results {
for result in similar_pairs {
all_results.push(DuplicateResult { file1: file.clone(), file2: file.clone(), result });
}
}
let duplicate_count =
display_all_results(all_results, print, filter_function, filter_function_body);
Ok(duplicate_count)
}
fn display_all_results(
mut all_results: Vec<DuplicateResult>,
print: bool,
filter_function: Option<&String>,
filter_function_body: Option<&String>,
) -> usize {
if all_results.is_empty() {
println!("\nNo duplicate functions found!");
return 0;
}
if filter_function.is_some() || filter_function_body.is_some() {
all_results.retain(|dup| {
if let Some(filter) = filter_function {
if !dup.result.func1.name.contains(filter)
&& !dup.result.func2.name.contains(filter)
{
return false;
}
}
true
});
}
all_results.sort_by(|a, b| {
b.priority().partial_cmp(&a.priority()).unwrap_or(std::cmp::Ordering::Equal)
});
let mut file_groups = std::collections::HashMap::new();
for dup in all_results {
let file_path = dup.file1.to_string_lossy().to_string();
file_groups.entry(file_path).or_insert_with(Vec::new).push(dup);
}
let mut total_count = 0;
for (file_path, duplicates) in file_groups {
println!("\nDuplicates in {}:", file_path);
println!("{}", "-".repeat(60));
for dup in &duplicates {
let func1 = &dup.result.func1;
let func2 = &dup.result.func2;
println!(
" {} <-> {}",
format_function_output(
&file_path,
&format!(
"{} {}",
if func1.is_method { "method" } else { "function" },
&func1.name
),
func1.start_line,
func1.end_line
),
format_function_output(
&file_path,
&format!(
"{} {}",
if func2.is_method { "method" } else { "function" },
&func2.name
),
func2.start_line,
func2.end_line
)
);
println!(" Similarity: {:.2}%", dup.result.similarity * 100.0);
if let (Some(class1), Some(class2)) = (&func1.class_name, &func2.class_name) {
println!(" Classes: {} <-> {}", class1, class2);
}
if print {
show_function_code(&file_path, &func1.name, func1.start_line, func1.end_line);
show_function_code(&file_path, &func2.name, func2.start_line, func2.end_line);
println!();
}
total_count += 1;
}
}
println!("\nTotal duplicate pairs found: {}", total_count);
total_count
}