use anyhow::Result;
use std::path::PathBuf;
use std::time::Instant;
use crate::services::similarity::{
ComprehensiveReport, EntropyBlock, EntropyReport, Metrics, RefactoringHint, SimilarBlock,
SimilarityConfig, SimilarityDetector,
};
/// Entry point for the similarity-analysis command.
///
/// Builds a [`SimilarityConfig`] from the CLI arguments, collects the matching source files
/// under `project_path`, runs the detector's comprehensive analysis, renders the report in
/// `format`, and either writes it to `output` or prints it to stdout. Progress messages, the
/// optional performance metrics (`perf`) and the closing summary are written to stderr.
///
/// # Errors
///
/// Returns an error if walking the project tree, rendering the report, or writing the output
/// file fails.
#[allow(clippy::too_many_arguments)]
pub async fn handle_analyze_similarity(
    project_path: PathBuf,
    detection_type: crate::cli::DuplicateType,
    threshold: f32,
    min_lines: usize,
    max_tokens: usize,
    format: crate::cli::DuplicateOutputFormat,
    perf: bool,
    include: Option<String>,
    exclude: Option<String>,
    output: Option<PathBuf>,
    top_files: usize,
) -> Result<()> {
    // The clock is only started when performance output was requested.
    let timer = perf.then(Instant::now);
    eprintln!("🔍 Advanced similarity analysis starting...");

    let detector =
        SimilarityDetector::new(build_config(detection_type, threshold, min_lines, max_tokens));
    let files = collect_files(&project_path, &include, &exclude).await?;
    eprintln!("📊 Analyzing {} files...", files.len());

    let report = detector.comprehensive_analysis(&files);
    let report = if top_files > 0 {
        filter_top_files(report, top_files)
    } else {
        report
    };

    let rendered = format_report(&report, format)?;
    match output {
        Some(destination) => {
            tokio::fs::write(&destination, &rendered).await?;
            eprintln!("📄 Report written to: {}", destination.display());
        }
        None => println!("{rendered}"),
    }

    if let Some(started) = timer {
        let elapsed = started.elapsed();
        print_performance_metrics(&report, elapsed);
    }
    print_summary(&report);
    Ok(())
}
fn build_config(
detection_type: crate::cli::DuplicateType,
threshold: f32,
min_lines: usize,
max_tokens: usize,
) -> SimilarityConfig {
let mut config = SimilarityConfig {
similarity_threshold: f64::from(threshold),
min_lines,
min_tokens: max_tokens,
..Default::default()
};
match detection_type {
crate::cli::DuplicateType::Exact => {
config.enable_ast = false;
config.enable_semantic = false;
}
crate::cli::DuplicateType::Fuzzy | crate::cli::DuplicateType::Renamed => {
config.enable_ast = true;
config.enable_semantic = false;
}
crate::cli::DuplicateType::Semantic | crate::cli::DuplicateType::Gapped => {
config.enable_ast = true;
config.enable_semantic = true;
}
crate::cli::DuplicateType::All => {
config.enable_ast = true;
config.enable_semantic = true;
config.enable_entropy = true;
}
}
config
}
async fn collect_files(
project_path: &PathBuf,
include: &Option<String>,
exclude: &Option<String>,
) -> Result<Vec<(PathBuf, String)>> {
use walkdir::WalkDir;
let mut files = Vec::new();
for entry in WalkDir::new(project_path) {
let entry = entry?;
let path = entry.path();
if path.is_file() && is_source_file(path) && should_include_file(path, include, exclude) {
if let Ok(content) = tokio::fs::read_to_string(path).await {
files.push((path.to_path_buf(), content));
}
}
}
Ok(files)
}
/// Returns `true` when `path` has one of the recognised source-code extensions.
///
/// The comparison is case-sensitive; paths without an extension, or with an extension that is
/// not valid UTF-8, are rejected.
fn is_source_file(path: &std::path::Path) -> bool {
    const SOURCE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "c", "cpp", "cc", "h", "hpp", "kt", "java", "go",
    ];

    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => SOURCE_EXTENSIONS.contains(&ext),
        None => false,
    }
}
/// Applies the optional `include`/`exclude` substring filters to `path`.
///
/// The path is compared in its lossy string form. A matching `exclude` pattern always rejects
/// the file; otherwise, when an `include` pattern is given, the file is accepted only if the
/// pattern occurs in the path. With no patterns every file is accepted.
fn should_include_file(
    path: &std::path::Path,
    include: &Option<String>,
    exclude: &Option<String>,
) -> bool {
    let rendered = path.to_string_lossy();
    match (exclude.as_deref(), include.as_deref()) {
        // Exclusion takes precedence over inclusion.
        (Some(banned), _) if rendered.contains(banned) => false,
        (_, Some(required)) => rendered.contains(required),
        (_, None) => true,
    }
}
/// Announces the "top files" limit on stderr and hands the report back.
///
/// NOTE(review): the report is returned unchanged — no filtering by file is performed here;
/// confirm whether the actual filtering is still to be implemented.
fn filter_top_files(report: ComprehensiveReport, top_files: usize) -> ComprehensiveReport {
    if top_files == 0 {
        return report;
    }
    eprintln!("📈 Showing top {top_files} files with issues");
    report
}
fn format_report(
report: &ComprehensiveReport,
format: crate::cli::DuplicateOutputFormat,
) -> Result<String> {
match format {
crate::cli::DuplicateOutputFormat::Json => Ok(serde_json::to_string_pretty(report)?),
crate::cli::DuplicateOutputFormat::Summary | crate::cli::DuplicateOutputFormat::Human => {
format_summary_report(report)
}
crate::cli::DuplicateOutputFormat::Detailed => format_detailed_report(report),
crate::cli::DuplicateOutputFormat::Csv => format_csv_report(report),
crate::cli::DuplicateOutputFormat::Sarif => format_sarif_report(report),
}
}
/// Builds the short Markdown summary: title, metrics, clone-type counts and the top
/// refactoring opportunities.
fn format_summary_report(report: &ComprehensiveReport) -> Result<String> {
    // Title line followed by one blank line.
    let mut summary = String::from("# Code Similarity Analysis Summary\n\n");
    format_summary_metrics(&mut summary, &report.metrics)?;
    format_summary_clone_types(&mut summary, report)?;
    format_summary_refactoring_opportunities(&mut summary, &report.refactoring_opportunities)?;
    Ok(summary)
}
/// Appends the "Metrics" section of the summary report to `output`.
///
/// Writes the duplication percentage (one decimal), the average entropy (two decimals) and the
/// total clone count, followed by a blank line.
fn format_summary_metrics(output: &mut String, metrics: &Metrics) -> Result<()> {
    use std::fmt::Write;

    // The trailing `\n` plus the newline added by `writeln!` produces the blank separator line.
    writeln!(
        output,
        "## Metrics\n\
         - Duplication: {:.1}%\n\
         - Average Entropy: {:.2}\n\
         - Total Clones: {}\n",
        metrics.duplication_percentage, metrics.average_entropy, metrics.total_clones
    )?;
    Ok(())
}
/// Appends the "Clone Types" section of the summary report to `output`.
///
/// Lists how many exact duplicates, structural similarities and semantic similarities the
/// report contains, followed by a blank line.
fn format_summary_clone_types(output: &mut String, report: &ComprehensiveReport) -> Result<()> {
    use std::fmt::Write;

    // The trailing `\n` plus the newline added by `writeln!` produces the blank separator line.
    writeln!(
        output,
        "## Clone Types\n\
         - Exact Duplicates: {}\n\
         - Structural Similarities: {}\n\
         - Semantic Similarities: {}\n",
        report.exact_duplicates.len(),
        report.structural_similarities.len(),
        report.semantic_similarities.len()
    )?;
    Ok(())
}
/// Appends a numbered list of at most five refactoring opportunities to `output`.
///
/// Nothing is written when `opportunities` is empty.
fn format_summary_refactoring_opportunities(
    output: &mut String,
    opportunities: &[RefactoringHint],
) -> Result<()> {
    use std::fmt::Write;

    if opportunities.is_empty() {
        return Ok(());
    }
    writeln!(output, "## Top Refactoring Opportunities")?;
    // Ranks are 1-based in the rendered list.
    for (rank, hint) in (1..).zip(opportunities.iter().take(5)) {
        writeln!(output, "{}. {}: {}", rank, hint.pattern, hint.suggestion)?;
    }
    Ok(())
}
/// Builds the detailed Markdown report: title, overall metrics, exact duplicates, structural
/// similarities, entropy analysis and refactoring opportunities, in that order.
fn format_detailed_report(report: &ComprehensiveReport) -> Result<String> {
    // Title line followed by one blank line.
    let mut detailed = String::from("# Comprehensive Code Similarity Report\n\n");
    format_metrics_section(&mut detailed, &report.metrics)?;
    format_exact_duplicates_section(&mut detailed, &report.exact_duplicates)?;
    format_structural_similarities_section(&mut detailed, &report.structural_similarities)?;
    format_entropy_analysis_section(&mut detailed, &report.entropy_analysis)?;
    format_refactoring_opportunities_section(&mut detailed, &report.refactoring_opportunities)?;
    Ok(detailed)
}
/// Appends the "Overall Metrics" section of the detailed report to `output`.
///
/// Writes the duplication percentage (one decimal), the average entropy (two decimals) and the
/// total clone count, followed by a blank line.
fn format_metrics_section(output: &mut String, metrics: &Metrics) -> Result<()> {
    use std::fmt::Write;

    // The trailing `\n` plus the newline added by `writeln!` produces the blank separator line.
    writeln!(
        output,
        "## Overall Metrics\n\
         - Duplication Percentage: {:.1}%\n\
         - Average Entropy: {:.2}\n\
         - Total Clones Found: {}\n",
        metrics.duplication_percentage, metrics.average_entropy, metrics.total_clones
    )?;
    Ok(())
}
/// Appends the "Exact Duplicates" section, one sub-section per block, to `output`.
///
/// Nothing is written when `exact_duplicates` is empty.
fn format_exact_duplicates_section(
    output: &mut String,
    exact_duplicates: &[SimilarBlock],
) -> Result<()> {
    use std::fmt::Write;

    if !exact_duplicates.is_empty() {
        writeln!(output, "## Exact Duplicates (Type-1 Clones)")?;
        exact_duplicates
            .iter()
            .try_for_each(|block| format_single_duplicate_block(output, block))?;
    }
    Ok(())
}
/// Appends one exact-duplicate block to `output`: heading, line and token counts, every
/// location as `file:start-end`, and a fenced preview of the content.
fn format_single_duplicate_block(output: &mut String, block: &SimilarBlock) -> Result<()> {
    use std::fmt::Write;

    writeln!(
        output,
        "\n### Block {}\n- Lines: {}\n- Tokens: {}\n- Locations:",
        block.id, block.lines, block.tokens
    )?;
    for site in &block.locations {
        writeln!(
            output,
            " - {}:{}-{}",
            site.file.display(),
            site.start_line,
            site.end_line
        )?;
    }
    writeln!(output, "- Preview:\n```\n{}\n```", block.content_preview)?;
    Ok(())
}
/// Appends the "Structural Similarities" section to `output`, showing at most the first ten
/// blocks.
///
/// Nothing is written when `structural_similarities` is empty.
fn format_structural_similarities_section(
    output: &mut String,
    structural_similarities: &[SimilarBlock],
) -> Result<()> {
    use std::fmt::Write;

    if !structural_similarities.is_empty() {
        writeln!(output, "\n## Structural Similarities (Type-2/3 Clones)")?;
        structural_similarities
            .iter()
            .take(10)
            .try_for_each(|block| format_single_structural_block(output, block))?;
    }
    Ok(())
}
/// Appends one structural-similarity block to `output`: heading, similarity as a percentage,
/// the clone type (debug-formatted) and every location as `file:start-end`.
fn format_single_structural_block(output: &mut String, block: &SimilarBlock) -> Result<()> {
    use std::fmt::Write;

    writeln!(
        output,
        "\n### Similarity {}\n- Similarity: {:.1}%\n- Type: {:?}\n- Locations:",
        block.id,
        block.similarity * 100.0,
        block.clone_type
    )?;
    for site in &block.locations {
        writeln!(
            output,
            " - {}:{}-{}",
            site.file.display(),
            site.start_line,
            site.end_line
        )?;
    }
    Ok(())
}
/// Appends the "Entropy Analysis" section to `output` when the report carries entropy data.
///
/// Writes the average entropy, then the high-entropy and low-entropy listings. Nothing is
/// written when `entropy_analysis` is `None`.
fn format_entropy_analysis_section(
    output: &mut String,
    entropy_analysis: &Option<EntropyReport>,
) -> Result<()> {
    use std::fmt::Write;

    if let Some(entropy) = entropy_analysis {
        writeln!(
            output,
            "\n## Entropy Analysis\n- Average Entropy: {:.2}",
            entropy.average_entropy
        )?;
        format_high_entropy_blocks(output, entropy)?;
        format_low_entropy_patterns(output, entropy)?;
    }
    Ok(())
}
/// Appends the "High Complexity Code" listing to `output`, showing at most five
/// high-entropy blocks.
///
/// Nothing is written when the report has no high-entropy blocks.
fn format_high_entropy_blocks(output: &mut String, entropy: &EntropyReport) -> Result<()> {
    use std::fmt::Write;

    if !entropy.high_entropy_blocks.is_empty() {
        writeln!(output, "\n### High Complexity Code (High Entropy)")?;
        entropy
            .high_entropy_blocks
            .iter()
            .take(5)
            .try_for_each(|block| format_entropy_block_item(output, block))?;
    }
    Ok(())
}
/// Appends the "Repetitive Patterns" listing to `output`, showing at most five low-entropy
/// blocks.
///
/// Nothing is written when the report has no low-entropy patterns.
fn format_low_entropy_patterns(output: &mut String, entropy: &EntropyReport) -> Result<()> {
    use std::fmt::Write;

    let patterns = &entropy.low_entropy_patterns;
    if !patterns.is_empty() {
        writeln!(output, "\n### Repetitive Patterns (Low Entropy)")?;
        for pattern in patterns.iter().take(5) {
            format_entropy_block_item(output, pattern)?;
        }
    }
    Ok(())
}
/// Appends one entropy entry to `output`: a bullet with `file:start_line` and the entropy
/// value (two decimals), followed by a line carrying the block's suggestion.
fn format_entropy_block_item(output: &mut String, block: &EntropyBlock) -> Result<()> {
    use std::fmt::Write;

    let site = &block.location;
    writeln!(
        output,
        "- {}:{} (entropy: {:.2})\n Suggestion: {}",
        site.file.display(),
        site.start_line,
        block.entropy,
        block.suggestion
    )?;
    Ok(())
}
/// Appends the "Refactoring Opportunities" section, one sub-section per hint, to `output`.
///
/// Nothing is written when `refactoring_opportunities` is empty.
fn format_refactoring_opportunities_section(
    output: &mut String,
    refactoring_opportunities: &[RefactoringHint],
) -> Result<()> {
    use std::fmt::Write;

    if !refactoring_opportunities.is_empty() {
        writeln!(output, "\n## Refactoring Opportunities")?;
        refactoring_opportunities
            .iter()
            .try_for_each(|hint| format_single_refactoring_hint(output, hint))?;
    }
    Ok(())
}
/// Appends one refactoring hint to `output`: the pattern as heading, its priority
/// (debug-formatted), the suggestion, and every affected location as `file:start-end`.
fn format_single_refactoring_hint(output: &mut String, hint: &RefactoringHint) -> Result<()> {
    use std::fmt::Write;

    writeln!(
        output,
        "\n### {}\n- Priority: {:?}\n- Suggestion: {}\n- Affected locations:",
        hint.pattern, hint.priority, hint.suggestion
    )?;
    for site in &hint.locations {
        writeln!(
            output,
            " - {}:{}-{}",
            site.file.display(),
            site.start_line,
            site.end_line
        )?;
    }
    Ok(())
}
/// Prints timing information for the finished analysis to stderr.
///
/// Reports the total elapsed time, the clone count from the report metrics, and a throughput
/// figure derived from the number of exact-duplicate blocks divided by the elapsed seconds.
/// When no measurable time has elapsed the rate is reported as 0 instead of `inf`/`NaN`.
fn print_performance_metrics(report: &ComprehensiveReport, elapsed: std::time::Duration) {
    // Fractional seconds are used because `as_millis()` truncates to 0 for sub-millisecond
    // runs, which previously turned the division into `inf` or `NaN`.
    let seconds = elapsed.as_secs_f64();
    let rate = if seconds > 0.0 {
        report.exact_duplicates.len() as f64 / seconds
    } else {
        0.0
    };

    eprintln!("\n⏱️ Performance Metrics:");
    eprintln!(" Total time: {elapsed:?}");
    eprintln!(" Clones found: {}", report.metrics.total_clones);
    // NOTE(review): the label says LOC/sec, but the numerator is the number of exact-duplicate
    // blocks, not lines of code — confirm which quantity is intended.
    eprintln!(" Analysis rate: {rate:.0} LOC/sec");
}
/// Renders the exact and structural clone pairs as CSV.
///
/// The first row is the header `Type,File1,Start1,End1,File2,Start2,End2,Similarity`. Each
/// block with at least two locations contributes one row built from its first two locations;
/// blocks with fewer locations are skipped. Exact duplicates always report `100.0`, structural
/// similarities report their similarity as a percentage with one decimal.
///
/// File paths are quoted per RFC 4180 when they contain a comma, a double quote or a line
/// break, so such paths no longer shift or split the columns of their row.
fn format_csv_report(report: &ComprehensiveReport) -> Result<String> {
    use std::fmt::Write;

    /// Returns `raw` unchanged when it is CSV-safe, otherwise wraps it in double quotes with
    /// embedded quotes doubled.
    fn csv_field(raw: &str) -> std::borrow::Cow<'_, str> {
        let needs_quoting = raw.contains(|c: char| matches!(c, ',' | '"' | '\n' | '\r'));
        if needs_quoting {
            std::borrow::Cow::Owned(format!("\"{}\"", raw.replace('"', "\"\"")))
        } else {
            std::borrow::Cow::Borrowed(raw)
        }
    }

    let mut output = String::new();
    writeln!(
        &mut output,
        "Type,File1,Start1,End1,File2,Start2,End2,Similarity"
    )?;

    for block in &report.exact_duplicates {
        // A pair needs two locations; only the first two are reported.
        if block.locations.len() >= 2 {
            let (first, second) = (&block.locations[0], &block.locations[1]);
            writeln!(
                &mut output,
                "Exact,{},{},{},{},{},{},100.0",
                csv_field(&first.file.display().to_string()),
                first.start_line,
                first.end_line,
                csv_field(&second.file.display().to_string()),
                second.start_line,
                second.end_line
            )?;
        }
    }

    for block in &report.structural_similarities {
        if block.locations.len() >= 2 {
            let (first, second) = (&block.locations[0], &block.locations[1]);
            writeln!(
                &mut output,
                "Structural,{},{},{},{},{},{},{:.1}",
                csv_field(&first.file.display().to_string()),
                first.start_line,
                first.end_line,
                csv_field(&second.file.display().to_string()),
                second.start_line,
                second.end_line,
                block.similarity * 100.0
            )?;
        }
    }

    Ok(output)
}
/// Renders the exact duplicates as a SARIF 2.1.0 document.
///
/// Every location of every exact-duplicate block becomes one `duplicate-code` warning result
/// carrying the file URI and the start/end line of the region. Only `exact_duplicates` are
/// included in the output.
fn format_sarif_report(report: &ComprehensiveReport) -> Result<String> {
    let mut results = Vec::new();
    for block in &report.exact_duplicates {
        // The message depends on the block only, so build it once per block.
        let message = format!("Exact duplicate found ({} lines)", block.lines);
        for location in &block.locations {
            let physical_location = serde_json::json!({
                "artifactLocation": {
                    "uri": location.file.display().to_string()
                },
                "region": {
                    "startLine": location.start_line,
                    "endLine": location.end_line
                }
            });
            results.push(serde_json::json!({
                "ruleId": "duplicate-code",
                "level": "warning",
                "message": {
                    "text": message
                },
                "locations": [{
                    "physicalLocation": physical_location
                }]
            }));
        }
    }

    let driver = serde_json::json!({
        "name": "pmat-similarity",
        "version": env!("CARGO_PKG_VERSION"),
        "informationUri": "https://github.com/paiml/paiml-mcp-agent-toolkit"
    });
    let sarif = serde_json::json!({
        "$schema": "https://json.schemastore.org/sarif-2.1.0.json",
        "version": "2.1.0",
        "runs": [{
            "tool": {
                "driver": driver
            },
            "results": results
        }]
    });
    Ok(serde_json::to_string_pretty(&sarif)?)
}
/// Prints the closing summary to stderr: duplication percentage, total clones, average
/// entropy and, when there are any, the number of refactoring opportunities.
fn print_summary(report: &ComprehensiveReport) {
    let metrics = &report.metrics;
    let hints = &report.refactoring_opportunities;

    eprintln!("\n✅ Analysis Complete:");
    eprintln!(" 📊 Duplication: {:.1}%", metrics.duplication_percentage);
    eprintln!(" 🔢 Total clones: {}", metrics.total_clones);
    eprintln!(" 📈 Average entropy: {:.2}", metrics.average_entropy);

    // The opportunities line is omitted entirely when there is nothing to report.
    if hints.is_empty() {
        return;
    }
    eprintln!(" 💡 Refactoring opportunities: {}", hints.len());
}
// Property-test scaffolding built on `proptest`.
// NOTE(review): both properties are placeholders — neither calls any function defined in this
// file, so they verify nothing about the similarity handlers.
#[cfg(test)]
mod property_tests {
use proptest::prelude::*;
proptest! {
// Accepts any string and asserts the constant `true`; it can never fail.
#[test]
fn basic_property_stability(_input in ".*") {
prop_assert!(true);
}
// Draws values from `0..1000` and checks they are below 1001, which always holds for
// that range.
#[test]
fn module_consistency_check(_x in 0u32..1000) {
prop_assert!(_x < 1001);
}
}
}