// Assumed imports for this section; `Result` is taken to be `anyhow::Result`.
use std::collections::HashMap;
use std::path::{Path, PathBuf};

use anyhow::Result;
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateBlock {
    pub hash: String,
    pub locations: Vec<DuplicateLocation>,
    pub lines: usize,
    pub tokens: usize,
    pub similarity: f32,
}
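/// One occurrence of a duplicated block within a source file.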
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateLocation {
    pub file: String,
    pub start_line: usize,
    pub end_line: usize,
    pub content_preview: String,
}
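/// Aggregate results of a duplicate-detection run.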
#[derive(Debug, Serialize)]
pub struct DuplicateReport {
    pub total_duplicates: usize,
    pub duplicate_lines: usize,
    pub total_lines: usize,
    pub duplication_percentage: f32,
    pub duplicate_blocks: Vec<DuplicateBlock>,
    pub file_statistics: HashMap<String, FileStats>,
}
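/// Per-file line counts and duplication percentage.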
#[derive(Debug, Serialize)]
pub struct FileStats {
    pub duplicate_lines: usize,
    pub total_lines: usize,
    pub duplication_percentage: f32,
}
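/// CLI entry point: detects duplicate code under `project_path`, optionally
/// keeps only the most-duplicated files, prints a summary (plus timing
/// metrics when `perf` is set), and writes the report in the chosen format.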
#[allow(clippy::too_many_arguments)]
pub async fn handle_analyze_duplicates(
    project_path: PathBuf,
    detection_type: crate::cli::DuplicateType,
    threshold: f32,
    min_lines: usize,
    max_tokens: usize,
    format: crate::cli::DuplicateOutputFormat,
    perf: bool,
    include: Option<String>,
    exclude: Option<String>,
    output: Option<PathBuf>,
    top_files: usize,
) -> Result<()> {
    {
        use crate::cli::colors as c;
        eprintln!("{}", c::dim("Analyzing code similarity..."));
    }
    let start_time = std::time::Instant::now();
    let mut report = run_duplicate_detection(
        &project_path,
        detection_type,
        threshold,
        min_lines,
        max_tokens,
        &include,
        &exclude,
    )
    .await?;
    apply_top_files_filtering(&mut report, top_files);
    print_duplicate_summary(&report);
    if perf {
        use crate::cli::colors as c;
        let duration = start_time.elapsed();
        // `Duration::as_millis()` returns an integer, for which the `{:.2}`
        // precision is ignored; format fractional milliseconds instead.
        let elapsed_ms = duration.as_secs_f64() * 1000.0;
        eprintln!("\n{}Performance Metrics:{}", c::BOLD, c::RESET);
        eprintln!(" {}Analysis time:{} {}{:.2}ms{}", c::BOLD, c::RESET, c::BOLD_WHITE, elapsed_ms, c::RESET);
        eprintln!(" {}Files processed:{} {}{}{}", c::BOLD, c::RESET, c::BOLD_WHITE, report.file_statistics.len(), c::RESET);
        eprintln!(" {}Blocks analyzed:{} {}{}{}", c::BOLD, c::RESET, c::BOLD_WHITE, report.duplicate_blocks.len(), c::RESET);
    }
    {
        use crate::cli::colors as c;
        eprintln!("\n{}", c::pass("Analysis Complete"));
    }
    write_duplicate_output(&report, format, output).await
}
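/// Thin wrapper that forwards all arguments to `detect_duplicates`.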
async fn run_duplicate_detection(
    project_path: &Path,
    detection_type: crate::cli::DuplicateType,
    threshold: f32,
    min_lines: usize,
    max_tokens: usize,
    include: &Option<String>,
    exclude: &Option<String>,
) -> Result<DuplicateReport> {
    detect_duplicates(
        project_path,
        detection_type,
        threshold,
        min_lines,
        max_tokens,
        include,
        exclude,
    )
    .await
}
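/// Restricts the report to the `top_files` files with the highest duplication
/// percentage; a value of `0` disables filtering.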
fn apply_top_files_filtering(report: &mut DuplicateReport, top_files: usize) {
    if top_files == 0 {
        return;
    }
    let top_file_names = get_top_files_by_duplication(&report.file_statistics, top_files);
    filter_blocks_by_files(report, &top_file_names);
    recalculate_statistics_after_filtering(report);
}
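/// Selects the names of the `top_files` files, ordered by descending
/// duplication percentage.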
fn get_top_files_by_duplication(
    file_statistics: &HashMap<String, FileStats>,
    top_files: usize,
) -> std::collections::HashSet<String> {
    let mut file_stats: Vec<_> = file_statistics.iter().collect();
    file_stats.sort_by(|a, b| {
        b.1.duplication_percentage
            .partial_cmp(&a.1.duplication_percentage)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    file_stats
        .into_iter()
        .take(top_files)
        .map(|(name, _)| name.clone())
        .collect()
}
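/// Keeps only blocks with at least one location in the selected files.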
fn filter_blocks_by_files(
    report: &mut DuplicateReport,
    top_file_names: &std::collections::HashSet<String>,
) {
    report.duplicate_blocks.retain(|block| {
        block
            .locations
            .iter()
            .any(|loc| top_file_names.contains(&loc.file))
    });
}
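/// Recomputes report totals from the remaining blocks. Every occurrence of a
/// block counts toward `duplicate_lines`, and `total_lines` is left untouched,
/// so the percentage stays relative to the whole project.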
fn recalculate_statistics_after_filtering(report: &mut DuplicateReport) {
    let mut duplicate_lines = 0;
    for block in &report.duplicate_blocks {
        duplicate_lines += block.lines * block.locations.len();
    }
    report.duplicate_lines = duplicate_lines;
    report.total_duplicates = report.duplicate_blocks.len();
    if report.total_lines > 0 {
        report.duplication_percentage =
            (duplicate_lines as f32 / report.total_lines as f32) * 100.0;
    }
}
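/// Prints a colorized summary of the detection results to stderr.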
fn print_duplicate_summary(report: &DuplicateReport) {
    use crate::cli::colors as c;
    eprintln!(
        "{} Found {} duplicate blocks",
        c::pass(""),
        c::number(&report.total_duplicates.to_string())
    );
    eprintln!(
        " {}Duplication:{} {} ({} / {} lines)",
        c::BOLD,
        c::RESET,
        c::pct(report.duplication_percentage as f64, 5.0, 15.0),
        c::number(&report.duplicate_lines.to_string()),
        c::number(&report.total_lines.to_string()),
    );
}
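/// Formats the report and writes it to `output`, or to stdout when no path
/// is given.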
async fn write_duplicate_output(
    report: &DuplicateReport,
    format: crate::cli::DuplicateOutputFormat,
    output: Option<PathBuf>,
) -> Result<()> {
    let content = format_output(report, format)?;
    if let Some(output_path) = output {
        tokio::fs::write(&output_path, &content).await?;
        eprintln!("📄 Report written to: {}", output_path.display());
    } else {
        println!("{content}");
    }
    Ok(())
}
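/// Core pipeline: collect candidate blocks, group them into duplicates, then
/// derive per-file and project-wide statistics.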
async fn detect_duplicates(
    project_path: &Path,
    detection_type: crate::cli::DuplicateType,
    threshold: f32,
    min_lines: usize,
    max_tokens: usize,
    include: &Option<String>,
    exclude: &Option<String>,
) -> Result<DuplicateReport> {
    let (all_blocks, total_lines, mut file_stats) = collect_code_blocks(
        project_path,
        detection_type,
        min_lines,
        max_tokens,
        include,
        exclude,
    )
    .await?;
    let duplicate_blocks = find_duplicate_blocks(all_blocks, threshold);
    let duplicate_lines = calculate_duplicate_statistics(&duplicate_blocks, &mut file_stats);
    let duplication_percentage = calculate_duplication_percentage(duplicate_lines, total_lines);
    Ok(build_duplicate_report(
        duplicate_blocks,
        duplicate_lines,
        total_lines,
        duplication_percentage,
        file_stats,
    ))
}
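/// Walks `project_path`, collecting candidate blocks and per-file line counts.
/// Each block tuple is assumed to be `(hash, file, start_line, end_line,
/// content_preview)`, mirroring the fields of `DuplicateLocation`.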
async fn collect_code_blocks(
    project_path: &Path,
    detection_type: crate::cli::DuplicateType,
    min_lines: usize,
    max_tokens: usize,
    include: &Option<String>,
    exclude: &Option<String>,
) -> Result<(
    Vec<(String, String, usize, usize, String)>,
    usize,
    HashMap<String, FileStats>,
)> {
    use walkdir::WalkDir;
    let mut all_blocks = Vec::new();
    let mut total_lines = 0usize;
    let mut file_stats = HashMap::new();
    for entry in WalkDir::new(project_path) {
        let entry = entry?;
        let path = entry.path();
        if should_analyze_file(path, include, exclude) {
            if let Some((blocks, lines_count)) =
                process_source_file(path, detection_type.clone(), min_lines, max_tokens).await
            {
                all_blocks.extend(blocks);
                total_lines += lines_count;
                file_stats.insert(
                    path.to_string_lossy().to_string(),
                    FileStats {
                        duplicate_lines: 0,
                        total_lines: lines_count,
                        duplication_percentage: 0.0,
                    },
                );
            }
        }
    }
    Ok((all_blocks, total_lines, file_stats))
}
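/// Returns `true` for regular source files that pass the include/exclude filters.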
fn should_analyze_file(path: &Path, include: &Option<String>, exclude: &Option<String>) -> bool {
    path.is_file() && is_source_file(path) && should_process_file(path, include, exclude)
}
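/// Reads a file and extracts candidate blocks; returns `None` (silently
/// skipping the file) when it cannot be read as UTF-8 text.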
async fn process_source_file(
    path: &Path,
    detection_type: crate::cli::DuplicateType,
    min_lines: usize,
    max_tokens: usize,
) -> Option<(Vec<(String, String, usize, usize, String)>, usize)> {
    if let Ok(content) = tokio::fs::read_to_string(path).await {
        let lines: Vec<&str> = content.lines().collect();
        let blocks = extract_blocks(&lines, path, min_lines, max_tokens, detection_type);
        Some((blocks, lines.len()))
    } else {
        None
    }
}
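/// Tallies duplicate lines per file and overall; a block found in N locations
/// adds `lines * N` to the total.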
fn calculate_duplicate_statistics(
    duplicate_blocks: &[DuplicateBlock],
    file_stats: &mut HashMap<String, FileStats>,
) -> usize {
    let mut duplicate_lines = 0;
    for block in duplicate_blocks {
        duplicate_lines += block.lines * block.locations.len();
        for loc in &block.locations {
            if let Some(stats) = file_stats.get_mut(&loc.file) {
                stats.duplicate_lines += block.lines;
            }
        }
    }
    update_file_duplication_percentages(file_stats);
    duplicate_lines
}
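/// Refreshes each file's duplication percentage from its line counts.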
fn update_file_duplication_percentages(file_stats: &mut HashMap<String, FileStats>) {
    for stats in file_stats.values_mut() {
        if stats.total_lines > 0 {
            stats.duplication_percentage =
                (stats.duplicate_lines as f32 / stats.total_lines as f32) * 100.0;
        }
    }
}
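/// Percentage of `total_lines` that are duplicated; `0.0` for empty input.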
fn calculate_duplication_percentage(duplicate_lines: usize, total_lines: usize) -> f32 {
    if total_lines > 0 {
        (duplicate_lines as f32 / total_lines as f32) * 100.0
    } else {
        0.0
    }
}
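/// Assembles the final `DuplicateReport` from the computed pieces.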
fn build_duplicate_report(
    duplicate_blocks: Vec<DuplicateBlock>,
    duplicate_lines: usize,
    total_lines: usize,
    duplication_percentage: f32,
    file_stats: HashMap<String, FileStats>,
) -> DuplicateReport {
    DuplicateReport {
        total_duplicates: duplicate_blocks.len(),
        duplicate_lines,
        total_lines,
        duplication_percentage,
        duplicate_blocks,
        file_statistics: file_stats,
    }
}