use crate::detectors::base::{is_test_file, Detector, DetectorScope};
use crate::graph::interner::StrKey;
use crate::graph::GraphQueryExt;
use crate::models::{Finding, Severity};
use anyhow::Result;
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, Hasher};
use std::path::PathBuf;
use tracing::info;
const DEFAULT_MIN_LINES: usize = 6;
const MIN_BLOCK_CONTENT_LEN: usize = 80;
const SKIP_PATH_SEGMENTS: &[&str] = &[
"fixtures/",
"fixture/",
"generated/",
"__generated__",
".generated.",
"proto/",
"migrations/",
"vendor/",
"node_modules/",
"dist/",
"snapshots/",
"_pb2.py",
"conftest",
];
pub struct DuplicateCodeDetector {
#[allow(dead_code)] repository_path: PathBuf,
max_findings: usize,
default_min_lines: usize,
}
impl DuplicateCodeDetector {
pub fn new(repository_path: impl Into<PathBuf>) -> Self {
Self {
repository_path: repository_path.into(),
max_findings: 50,
default_min_lines: DEFAULT_MIN_LINES,
}
}
fn normalize_line(line: &str) -> String {
let trimmed = line.trim();
if trimmed.starts_with("//") || trimmed.starts_with("#") || trimmed.starts_with("*") {
return String::new();
}
trimmed.split_whitespace().collect::<Vec<_>>().join(" ")
}
fn hash_block(block: &str) -> u64 {
let mut hasher = std::collections::hash_map::DefaultHasher::new();
block.hash(&mut hasher);
hasher.finish()
}
fn is_skip_path(path: &std::path::Path) -> bool {
let path_str = path.to_string_lossy();
let lower = path_str.to_lowercase();
SKIP_PATH_SEGMENTS.iter().any(|seg| lower.contains(seg))
}
fn find_containing_functions(
&self,
graph: &dyn crate::graph::GraphQuery,
locations: &[(PathBuf, usize)],
) -> Vec<Option<StrKey>> {
locations
.iter()
.map(|(path, line)| {
let path_str = path.to_string_lossy();
graph
.find_function_at(&path_str, *line as u32)
.map(|f| f.qualified_name)
})
.collect()
}
fn is_trait_impl(qn: &str) -> bool {
qn.contains("impl<") && qn.contains(" for ")
}
fn all_trait_impls(
graph: &dyn crate::graph::GraphQuery,
containing_funcs: &[Option<StrKey>],
) -> bool {
let interner = graph.interner();
let has_some = containing_funcs.iter().any(|f| f.is_some());
if !has_some {
return false;
}
containing_funcs.iter().all(|f| {
f.is_some_and(|qn_key| {
let qn = interner.resolve(qn_key);
Self::is_trait_impl(qn)
})
})
}
fn all_infrastructure(
ctx: &crate::detectors::analysis_context::AnalysisContext,
containing_funcs: &[Option<StrKey>],
) -> bool {
let interner = ctx.graph.interner();
let has_some = containing_funcs.iter().any(|f| f.is_some());
if !has_some {
return false;
}
containing_funcs.iter().all(|f| {
f.is_some_and(|qn_key| {
let qn = interner.resolve(qn_key);
ctx.is_infrastructure(qn)
})
})
}
fn all_test_functions(
ctx: &crate::detectors::analysis_context::AnalysisContext,
containing_funcs: &[Option<StrKey>],
) -> bool {
let interner = ctx.graph.interner();
let has_some = containing_funcs.iter().any(|f| f.is_some());
if !has_some {
return false;
}
containing_funcs.iter().all(|f| {
f.is_some_and(|qn_key| {
let qn = interner.resolve(qn_key);
ctx.is_test_function(qn)
})
})
}
fn deduplicate_locations(
locations: &[(PathBuf, usize)],
min_lines: usize,
) -> Vec<(PathBuf, usize)> {
let mut by_file: HashMap<&PathBuf, Vec<usize>> = HashMap::new();
for (path, line) in locations {
by_file.entry(path).or_default().push(*line);
}
let mut deduped = Vec::new();
for (path, mut lines) in by_file {
lines.sort_unstable();
let mut last_kept: Option<usize> = None;
for line in lines {
if let Some(prev) = last_kept {
if line.saturating_sub(prev) < min_lines {
continue; }
}
last_kept = Some(line);
deduped.push((path.clone(), line));
}
}
deduped.sort_by(|a, b| a.0.cmp(&b.0).then(a.1.cmp(&b.1)));
deduped
}
fn analyze_caller_similarity(
&self,
graph: &dyn crate::graph::GraphQuery,
containing_funcs: &[Option<StrKey>],
) -> (usize, String) {
let i = graph.interner();
let valid_funcs: Vec<StrKey> = containing_funcs.iter().filter_map(|f| *f).collect();
if valid_funcs.len() < 2 {
return (0, String::new());
}
let caller_sets: Vec<HashSet<StrKey>> = valid_funcs
.iter()
.map(|&qn| {
graph
.get_callers(i.resolve(qn))
.into_iter()
.map(|c| c.qualified_name)
.collect()
})
.collect();
if caller_sets.is_empty() {
return (0, String::new());
}
let common_callers: HashSet<StrKey> = caller_sets[0]
.iter()
.filter(|caller| caller_sets.iter().skip(1).all(|set| set.contains(*caller)))
.copied()
.collect();
let suggestion = if !common_callers.is_empty() {
let func_path_map: HashMap<StrKey, StrKey> = graph
.get_functions()
.into_iter()
.map(|f| (f.qualified_name, f.file_path))
.collect();
let mut module_counts: HashMap<String, usize> = HashMap::new();
for caller_key in &common_callers {
if let Some(&path_key) = func_path_map.get(caller_key) {
let path = i.resolve(path_key);
let module = path.rsplit('/').nth(1).unwrap_or("utils").to_string();
*module_counts.entry(module).or_default() += 1;
}
}
module_counts
.into_iter()
.max_by_key(|(_, count)| *count)
.map(|(module, _)| module)
.unwrap_or_else(|| "utils".to_string())
} else {
String::new()
};
(common_callers.len(), suggestion)
}
}
impl Detector for DuplicateCodeDetector {
fn name(&self) -> &'static str {
"duplicate-code"
}
fn description(&self) -> &'static str {
"Detects copy-pasted code blocks"
}
fn requires_graph(&self) -> bool {
false
}
fn detector_scope(&self) -> DetectorScope {
DetectorScope::FileScopedGraph
}
fn file_extensions(&self) -> &'static [&'static str] {
&[
"py", "js", "ts", "jsx", "tsx", "rb", "java", "go", "rs", "c", "cpp", "cs",
]
}
fn bypass_postprocessor(&self) -> bool {
true
}
fn detect(
&self,
ctx: &crate::detectors::analysis_context::AnalysisContext,
) -> Result<Vec<Finding>> {
let graph = ctx.graph;
let files = &ctx.as_file_provider();
let mut findings = vec![];
use rayon::prelude::*;
let adaptive_min = ctx.threshold(
crate::calibrate::MetricKind::FunctionLength,
self.default_min_lines as f64,
);
let min_lines = (adaptive_min as usize).clamp(DEFAULT_MIN_LINES, 15);
let source_files = files.files_with_extensions(&[
"py", "js", "ts", "jsx", "tsx", "java", "go", "rs", "rb", "php", "c", "cpp",
]);
let per_file: Vec<Vec<(u64, PathBuf, usize)>> = source_files
.par_iter()
.filter(|path| !is_test_file(path))
.filter(|path| !Self::is_skip_path(path))
.filter_map(|path| {
files.content(path).map(|content| {
let normalized: Vec<String> =
content.lines().map(Self::normalize_line).collect();
let mut file_blocks = Vec::new();
for i in 0..normalized.len().saturating_sub(min_lines) {
let block: String = normalized[i..i + min_lines]
.iter()
.filter(|l| !l.is_empty())
.cloned()
.collect::<Vec<_>>()
.join("\n");
if block.len() > MIN_BLOCK_CONTENT_LEN {
let hash = Self::hash_block(&block);
file_blocks.push((hash, path.to_path_buf(), i + 1));
}
}
file_blocks
})
})
.collect();
let mut blocks: HashMap<u64, Vec<(PathBuf, usize)>> = HashMap::new();
for file_blocks in per_file {
for (hash, path, line) in file_blocks {
blocks.entry(hash).or_default().push((path, line));
}
}
let mut sorted_blocks: Vec<_> = blocks.into_iter().collect();
sorted_blocks.sort_by_key(|(hash, _)| *hash);
for (_block, raw_locations) in sorted_blocks {
if raw_locations.len() <= 1 || findings.len() >= self.max_findings {
continue;
}
let locations = Self::deduplicate_locations(&raw_locations, min_lines);
if locations.len() <= 1 {
continue; }
let affected: Vec<_> = locations.iter().map(|(p, _)| p.clone()).collect();
let first_line = locations[0].1;
if affected.iter().all(|p| is_test_file(p)) {
continue;
}
if affected.iter().any(|p| Self::is_skip_path(p)) {
continue;
}
let containing_funcs = self.find_containing_functions(graph, &locations);
let (common_callers, suggested_module) =
self.analyze_caller_similarity(graph, &containing_funcs);
if Self::all_test_functions(ctx, &containing_funcs) {
continue;
}
let mut severity = if common_callers >= 2 {
Severity::High } else if locations.len() > 3 {
Severity::Medium
} else {
Severity::Low
};
if Self::all_infrastructure(ctx, &containing_funcs) {
severity = Severity::Low;
}
if Self::all_trait_impls(graph, &containing_funcs) {
severity = Severity::Low;
}
let caller_note = if common_callers > 0 {
format!("\n\n{} common caller(s) call all duplicate locations - strong refactor signal.", common_callers)
} else {
String::new()
};
let suggestion = if !suggested_module.is_empty() && common_callers > 0 {
format!(
"Extract into a shared function in the `{}` module (where most callers are).",
suggested_module
)
} else {
"Extract into a shared function.".to_string()
};
let interner = graph.interner();
let func_list: Vec<String> = containing_funcs
.iter()
.zip(locations.iter())
.filter_map(|(f, (path, line))| {
f.map(|qn_key| {
let qn = interner.resolve(qn_key);
let name = qn.rsplit("::").next().unwrap_or(qn);
format!(
" - `{}` ({}:{})",
name,
path.file_name().unwrap_or_default().to_string_lossy(),
line
)
})
})
.take(5)
.collect();
let func_note = if !func_list.is_empty() {
format!("\n\n**Found in functions:**\n{}", func_list.join("\n"))
} else {
String::new()
};
findings.push(Finding {
id: String::new(),
detector: "DuplicateCodeDetector".to_string(),
severity,
title: format!("Duplicate code ({} occurrences)", locations.len()),
description: format!(
"Same code block found in **{} places**.{}{}",
locations.len(),
func_note,
caller_note
),
affected_files: affected,
line_start: Some(first_line as u32),
line_end: Some((first_line + min_lines) as u32),
suggested_fix: Some(suggestion),
estimated_effort: Some(if common_callers >= 2 { "30 minutes".to_string() } else { "20 minutes".to_string() }),
category: Some("maintainability".to_string()),
cwe_id: None,
why_it_matters: Some(
"Duplicate code means duplicate bugs. When you fix one, you must remember to fix all copies."
.to_string()
),
..Default::default()
});
}
info!(
"DuplicateCodeDetector found {} findings (graph-aware)",
findings.len()
);
Ok(findings)
}
}
impl crate::detectors::RegisteredDetector for DuplicateCodeDetector {
fn create(init: &crate::detectors::DetectorInit) -> std::sync::Arc<dyn Detector> {
std::sync::Arc::new(Self::new(init.repo_path))
}
fn max_tier() -> crate::models::Tier {
crate::models::Tier::Deep
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::graph::builder::GraphBuilder;
#[test]
fn test_detects_duplicate_code_blocks() {
let block = "def calculate_total(items):\n total = 0\n for item in items:\n price = item.get_price()\n total += price * item.quantity\n return total\n\ndef wrapper():\n pass\n";
let store = GraphBuilder::new().freeze();
let detector = DuplicateCodeDetector::new("/mock/repo");
let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
&store,
vec![("billing.py", block), ("reporting.py", block)],
);
let findings = detector.detect(&ctx).expect("detection should succeed");
assert!(
!findings.is_empty(),
"Should detect duplicate code blocks across two files"
);
assert!(
findings[0].title.contains("Duplicate"),
"Title should mention duplicate, got: {}",
findings[0].title
);
}
#[test]
fn test_no_finding_for_unique_code() {
let store = GraphBuilder::new().freeze();
let detector = DuplicateCodeDetector::new("/mock/repo");
let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(&store, vec![
("module_a.py", "def alpha():\n x = 1\n y = 2\n z = 3\n w = 4\n return x + y + z + w\n"),
("module_b.py", "def beta():\n a = 10\n b = 20\n c = 30\n d = 40\n return a * b * c * d\n"),
]);
let findings = detector.detect(&ctx).expect("detection should succeed");
assert!(
findings.is_empty(),
"Should not flag unique code blocks, but got: {:?}",
findings.iter().map(|f| &f.title).collect::<Vec<_>>()
);
}
#[test]
fn test_skips_test_file_duplicates() {
let block = "def calculate_total(items):\n total = 0\n for item in items:\n price = item.get_price()\n total += price * item.quantity\n return total\n\ndef wrapper():\n pass\n";
let store = GraphBuilder::new().freeze();
let detector = DuplicateCodeDetector::new("/mock/repo");
let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
&store,
vec![
("tests/test_billing.py", block),
("tests/test_reporting.py", block),
],
);
let findings = detector.detect(&ctx).expect("detection should succeed");
assert!(
findings.is_empty(),
"Should skip duplicates when both files are test files, got {} findings",
findings.len()
);
}
#[test]
fn test_skips_generated_files() {
let block = "def calculate_total(items):\n total = 0\n for item in items:\n price = item.get_price()\n total += price * item.quantity\n return total\n\ndef wrapper():\n pass\n";
let store = GraphBuilder::new().freeze();
let detector = DuplicateCodeDetector::new("/mock/repo");
let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
&store,
vec![
("generated/billing.py", block),
("generated/reporting.py", block),
],
);
let findings = detector.detect(&ctx).expect("detection should succeed");
assert!(
findings.is_empty(),
"Should skip duplicates in generated directories, got {} findings",
findings.len()
);
}
#[test]
fn test_skips_fixture_files() {
let block = "def calculate_total(items):\n total = 0\n for item in items:\n price = item.get_price()\n total += price * item.quantity\n return total\n\ndef wrapper():\n pass\n";
let store = GraphBuilder::new().freeze();
let detector = DuplicateCodeDetector::new("/mock/repo");
let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
&store,
vec![
("fixtures/billing.py", block),
("fixtures/reporting.py", block),
],
);
let findings = detector.detect(&ctx).expect("detection should succeed");
assert!(
findings.is_empty(),
"Should skip duplicates in fixture directories, got {} findings",
findings.len()
);
}
#[test]
fn test_is_trait_impl() {
assert!(DuplicateCodeDetector::is_trait_impl(
"src/detectors/god_class.rs::impl<Detector for GodClassDetector>::detect:42"
));
assert!(!DuplicateCodeDetector::is_trait_impl(
"src/detectors/god_class.rs::impl<GodClassDetector>::new:10"
));
assert!(!DuplicateCodeDetector::is_trait_impl(
"src/detectors/god_class.rs::calculate_score:100"
));
}
#[test]
fn test_deduplicate_overlapping_locations() {
let path = PathBuf::from("src/foo.rs");
let locations = vec![
(path.clone(), 10),
(path.clone(), 11),
(path.clone(), 12),
(path.clone(), 20), ];
let deduped = DuplicateCodeDetector::deduplicate_locations(&locations, 6);
assert_eq!(deduped.len(), 2);
assert_eq!(deduped[0].1, 10);
assert_eq!(deduped[1].1, 20);
}
#[test]
fn test_deduplicate_different_files() {
let locations = vec![
(PathBuf::from("src/a.rs"), 10),
(PathBuf::from("src/a.rs"), 11),
(PathBuf::from("src/b.rs"), 10),
(PathBuf::from("src/b.rs"), 11),
];
let deduped = DuplicateCodeDetector::deduplicate_locations(&locations, 6);
assert_eq!(deduped.len(), 2);
}
#[test]
fn test_skip_path_segments() {
assert!(DuplicateCodeDetector::is_skip_path(std::path::Path::new(
"src/generated/foo.py"
)));
assert!(DuplicateCodeDetector::is_skip_path(std::path::Path::new(
"db/migrations/001.py"
)));
assert!(DuplicateCodeDetector::is_skip_path(std::path::Path::new(
"vendor/lib.go"
)));
assert!(DuplicateCodeDetector::is_skip_path(std::path::Path::new(
"api/proto/service.py"
)));
assert!(DuplicateCodeDetector::is_skip_path(std::path::Path::new(
"tests/fixtures/data.py"
)));
assert!(!DuplicateCodeDetector::is_skip_path(std::path::Path::new(
"src/billing.py"
)));
}
}