use super::{
DetectionConfig, DetectionInput, DetectionOutput, Detector, DetectorCapabilities,
DetectorSpecificConfig,
};
use anyhow::Result;
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::path::Path;
/// Stateless detector that reports duplicated code.
///
/// The type has no fields, so every instance is interchangeable; `new()` and
/// `Default::default()` produce the same value.
#[derive(Debug, Clone, Default)]
pub struct DuplicateDetector;

impl DuplicateDetector {
    /// Creates a new detector.
    ///
    /// Equivalent to [`DuplicateDetector::default`].
    #[must_use]
    pub fn new() -> Self {
        Self
    }
}
#[async_trait]
impl Detector for DuplicateDetector {
    type Input = DetectionInput;
    type Output = DetectionOutput;
    type Config = DetectionConfig;

    /// Runs duplicate detection for `input` and wraps the outcome in
    /// `DetectionOutput::Duplicates`.
    ///
    /// The duplicate-specific settings are taken from
    /// `config.detector_specific` when it is the `Duplicates` variant; any
    /// other variant falls back to `DuplicateConfig::default()`.
    ///
    /// * `SingleFile` / `MultipleFiles` — the given paths are analysed as-is.
    /// * `ProjectDirectory` — the directory is scanned recursively for
    ///   recognised source files first.
    /// * `Content` — in-memory content is not analysed; an empty result with
    ///   an all-zero summary is returned.
    ///
    /// # Errors
    ///
    /// Propagates any error from the directory scan or from the file-level
    /// detection step.
    async fn detect(&self, input: Self::Input, config: Self::Config) -> Result<Self::Output> {
        let settings = if let DetectorSpecificConfig::Duplicates(custom) = config.detector_specific
        {
            custom
        } else {
            DuplicateConfig::default()
        };

        // Resolve every file-based input to a list of paths; raw content has
        // no file list and short-circuits with an empty report.
        let targets = match input {
            DetectionInput::SingleFile(path) => vec![path],
            DetectionInput::MultipleFiles(paths) => paths,
            DetectionInput::ProjectDirectory(root) => self.scan_directory_for_files(&root)?,
            DetectionInput::Content(_ignored) => {
                let empty = DuplicateDetectionResult {
                    duplicates: Vec::new(),
                    summary: DuplicateSummary {
                        total_groups: 0,
                        total_duplicates: 0,
                        files_analyzed: 0,
                        time_saved_hours: 0.0,
                    },
                };
                return Ok(DetectionOutput::Duplicates(empty));
            }
        };

        let report = self
            .detect_duplicates_in_files(&targets, &settings)
            .await?;
        Ok(DetectionOutput::Duplicates(report))
    }

    /// Identifier of this detector: `"duplicates"`.
    fn name(&self) -> &'static str {
        "duplicates"
    }

    /// Static capability flags: batch input is supported, streaming is not,
    /// the detector is language-agnostic and does not require an AST.
    fn capabilities(&self) -> DetectorCapabilities {
        DetectorCapabilities {
            requires_ast: false,
            language_agnostic: true,
            supports_streaming: false,
            supports_batch: true,
        }
    }
}
impl DuplicateDetector {
    /// Builds the engine configuration from `config`, constructs the
    /// detection engine, and counts how many of `files` can be read as UTF-8
    /// text.
    ///
    /// NOTE(review): the engine is constructed but never invoked in this
    /// function, so the returned result always has an empty `duplicates`
    /// list and zeroed group/duplicate/time counters; only `files_analyzed`
    /// varies. Confirm whether the engine call is still to be wired in.
    ///
    /// Files that cannot be read (missing, unreadable, or not valid UTF-8)
    /// are skipped silently and do not count towards `files_analyzed`.
    async fn detect_duplicates_in_files(
        &self,
        files: &[std::path::PathBuf],
        config: &DuplicateConfig,
    ) -> Result<DuplicateDetectionResult> {
        let engine_config = crate::services::duplicate_detector::DuplicateDetectionConfig {
            min_tokens: config.min_lines,
            similarity_threshold: config.similarity_threshold,
            shingle_size: 3,
            num_hash_functions: config.hash_count,
            num_bands: 10,
            // Integer division: the remainder of hash_count / 10 is dropped.
            rows_per_band: config.hash_count / 10,
            normalize_identifiers: true,
            normalize_literals: true,
            ignore_comments: config.ignore_whitespace,
            min_group_size: 2,
        };
        // Kept alive (named binding, not `_`) until the end of the function.
        let _engine =
            crate::services::duplicate_detector::DuplicateDetectionEngine::new(engine_config);

        let files_analyzed = files
            .iter()
            .filter(|path| std::fs::read_to_string(path).is_ok())
            .count();

        Ok(DuplicateDetectionResult {
            duplicates: Vec::new(),
            summary: DuplicateSummary {
                total_groups: 0,
                total_duplicates: 0,
                files_analyzed,
                time_saved_hours: 0.0,
            },
        })
    }

    /// Recursively collects files under `dir` whose extension is one of
    /// `rs`, `ts`, `js`, `py`, `c`, `cpp`, `h` or `hpp` (case-sensitive).
    ///
    /// Returns an empty list when `dir` is not a directory. Files found in a
    /// sub-directory are appended at the point where that sub-directory is
    /// encountered in the listing.
    ///
    /// # Errors
    ///
    /// Fails if a directory cannot be listed or an entry cannot be read,
    /// including inside nested directories.
    fn scan_directory_for_files(&self, dir: &Path) -> Result<Vec<std::path::PathBuf>> {
        let mut collected = Vec::new();
        if !dir.is_dir() {
            return Ok(collected);
        }

        for entry in std::fs::read_dir(dir)? {
            let candidate = entry?.path();
            if candidate.is_file() {
                // Extensions that are missing or not valid UTF-8 never match.
                let recognised = candidate
                    .extension()
                    .and_then(|ext| ext.to_str())
                    .map_or(false, |ext| {
                        matches!(
                            ext,
                            "rs" | "ts" | "js" | "py" | "c" | "cpp" | "h" | "hpp"
                        )
                    });
                if recognised {
                    collected.push(candidate);
                }
            } else if candidate.is_dir() {
                collected.extend(self.scan_directory_for_files(&candidate)?);
            }
        }

        Ok(collected)
    }
}
/// Settings for [`DuplicateDetector`], selected via
/// `DetectorSpecificConfig::Duplicates`.
///
/// The per-field notes describe how `detect_duplicates_in_files` maps each
/// value onto the underlying engine configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateConfig {
/// Forwarded unchanged as the engine's `similarity_threshold`.
pub similarity_threshold: f64,
/// Forwarded to the engine as `min_tokens`.
pub min_lines: usize,
/// Forwarded as `num_hash_functions`; also divided by 10 (integer
/// division) to derive `rows_per_band`.
pub hash_count: usize,
/// Forwarded to the engine as `ignore_comments`.
pub ignore_whitespace: bool,
/// Not read anywhere in this file.
/// NOTE(review): confirm whether another module consumes this flag.
pub cross_language: bool,
}
impl Default for DuplicateConfig {
    /// Returns the fallback settings used when no duplicate-specific
    /// configuration is supplied: threshold `0.8`, `min_lines` 3,
    /// `hash_count` 128, with both boolean flags enabled.
    fn default() -> Self {
        let similarity_threshold = 0.8;
        let min_lines = 3;
        let hash_count = 128;
        let ignore_whitespace = true;
        let cross_language = true;

        Self {
            similarity_threshold,
            min_lines,
            hash_count,
            ignore_whitespace,
            cross_language,
        }
    }
}
/// Outcome of a duplicate-detection run; `DuplicateDetector::detect` wraps
/// it in `DetectionOutput::Duplicates`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateDetectionResult {
/// Detected duplicate groups. Every code path in this file currently
/// produces an empty list.
pub duplicates: Vec<DuplicateGroup>,
/// Aggregate counters for the run.
pub summary: DuplicateSummary,
}
/// One group of code fragments reported together as duplicates.
///
/// NOTE(review): nothing in this file constructs this type, so the field
/// notes below are assumptions drawn from names and types — confirm against
/// the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateGroup {
/// Identifier of the group (format not established in this file).
pub id: String,
/// Similarity score for the group — presumably on the same scale as
/// `DuplicateConfig::similarity_threshold`; TODO confirm.
pub similarity: f64,
/// The fragments that belong to this group.
pub fragments: Vec<CodeFragment>,
/// Classification of the clone relationship.
pub clone_type: CloneType,
}
/// A span of code inside a file, used as a member of a [`DuplicateGroup`].
///
/// NOTE(review): nothing in this file constructs this type; line-number base
/// and inclusivity are not established here — confirm against the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeFragment {
/// Path of the file containing the fragment.
pub file: std::path::PathBuf,
/// First line of the fragment (presumably 1-based — TODO confirm).
pub start_line: usize,
/// Last line of the fragment (presumably inclusive — TODO confirm).
pub end_line: usize,
/// Text of the fragment.
pub content: String,
/// Hash associated with the fragment (presumably derived from `content`;
/// algorithm not visible here — TODO confirm).
pub hash: String,
}
/// Aggregate counters attached to a [`DuplicateDetectionResult`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateSummary {
/// Number of duplicate groups; always 0 in the code paths of this file.
pub total_groups: usize,
/// Number of duplicates; always 0 in the code paths of this file.
pub total_duplicates: usize,
/// Number of input files that could be read as UTF-8 text; 0 for
/// in-memory `Content` input.
pub files_analyzed: usize,
/// Always 0.0 in the code paths of this file.
/// NOTE(review): presumably an estimate of effort saved by removing the
/// duplicates — confirm how it is meant to be computed.
pub time_saved_hours: f64,
}
/// Classification of a duplicate group, each variant carrying its own
/// similarity score.
///
/// NOTE(review): the variant names presumably follow the conventional
/// Type-1 / Type-2 / Type-3 code-clone taxonomy — confirm with the producer,
/// since nothing in this file constructs these values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum CloneType {
/// Carries only a similarity score.
Type1 { similarity: f64 },
/// Carries a similarity score and a `normalized` flag.
Type2 { similarity: f64, normalized: bool },
/// Carries a similarity score and an `ast_distance` value.
Type3 { similarity: f64, ast_distance: f64 },
}
// Property-test scaffolding. Neither test exercises code from this module:
// the first asserts a constant and the second checks a fact about its own
// generated input.
#[cfg(test)]
mod property_tests {
use proptest::prelude::*;
proptest! {
// Placeholder: accepts any string matching ".*" and always passes.
#[test]
fn basic_property_stability(_input in ".*") {
prop_assert!(true);
}
// Checks that a value drawn from 0..1000 is below 1001, which holds for
// every value in that range.
#[test]
fn module_consistency_check(_x in 0u32..1000) {
prop_assert!(_x < 1001);
}
}
}