use std::collections::{BTreeMap, HashMap};
use std::fmt::Write as _;
use std::path::{Path, PathBuf};
use serde::{Deserialize, Serialize};
use tree_sitter::TreeCursor;
use crate::core::config::Config;
use crate::core::finding::{Finding, IntoFindings, Location};
use crate::core::severity::Severity;
use crate::feature::{decorate, Feature, FeatureKind, FeatureMeta, HotspotIndex};
use crate::observer::complexity::{parse, ParsedFile};
use crate::observer::lang::Language;
use crate::observer::walk::walk_supported_files;
use crate::observer::{ObservationMeta, Observer};
use crate::observers::ObserverReports;
/// 64-bit FNV-1a prime; also used as the base of the Rabin-Karp rolling hash.
const HASH_BASE: u64 = 0x100_0000_01b3;
/// 64-bit FNV-1a offset basis.
const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
/// Detects copy-pasted token sequences across a project's source files.
#[derive(Debug, Clone, Default)]
pub struct DuplicationObserver {
    /// Whether the duplication metric is enabled.
    pub enabled: bool,
    /// Paths excluded from the scan.
    pub excluded: Vec<String>,
    /// Minimum run of identical tokens that counts as a duplicate (the
    /// rolling-hash window size).
    pub min_tokens: u32,
}
impl DuplicationObserver {
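    /// Builds the observer from the project configuration.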
#[must_use]
pub fn from_config(cfg: &Config) -> Self {
Self {
enabled: cfg.metrics.duplication.enabled,
excluded: cfg.observer_excluded_paths(),
min_tokens: cfg.metrics.duplication.min_tokens,
}
}
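    /// Scans `root` for duplicated token sequences.
    ///
    /// Each supported source file is parsed with tree-sitter and reduced to a
    /// stream of leaf-token hashes; files with fewer tokens than the window
    /// are skipped. Duplicate windows are then detected across all files and
    /// rolled up into per-block and per-file statistics.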
#[must_use]
pub fn scan(&self, root: &Path) -> DuplicationReport {
let mut report = DuplicationReport {
min_tokens: self.min_tokens,
..DuplicationReport::default()
};
if !self.enabled || self.min_tokens == 0 {
return report;
}
let window = self.min_tokens as usize;
let mut files: Vec<FileTokens> = Vec::new();
for path in walk_supported_files(root, &self.excluded) {
let Some(lang) = Language::from_path(&path) else {
continue;
};
let Ok(source) = std::fs::read_to_string(&path) else {
continue;
};
let Ok(parsed) = parse(source, lang) else {
continue;
};
let (hashes, lines) = collect_tokens(&parsed);
if hashes.len() < window {
continue;
}
let rel = path
.strip_prefix(root)
.map(Path::to_path_buf)
.unwrap_or(path);
files.push(FileTokens {
path: rel,
hashes,
lines,
});
}
if files.is_empty() {
return report;
}
let report_blocks = assemble_report_blocks(detect_blocks(&files, window), &files);
let (files_summary, affected_count, duplicate_tokens) =
file_level_rollup(&files, &report_blocks);
report.totals = DuplicationTotals {
duplicate_blocks: report_blocks.len(),
duplicate_tokens,
files_affected: affected_count,
};
report.blocks = report_blocks;
report.files = files_summary;
report
}
}
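/// Converts internal blocks into report form: resolves token offsets to line
/// ranges, sorts each block's locations by path and start line, and orders
/// blocks from largest to smallest token count.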
fn assemble_report_blocks(blocks: Vec<InternalBlock>, files: &[FileTokens]) -> Vec<DuplicateBlock> {
let mut report_blocks: Vec<DuplicateBlock> = blocks
.into_iter()
.map(|b| {
let locations = b
.locations
.into_iter()
.map(|(fi, start)| {
let f = &files[fi];
let start_line = f.lines.get(start).copied().unwrap_or(0);
let end_line = f
.lines
.get(start + b.token_count - 1)
.copied()
.unwrap_or(start_line);
DuplicateLocation {
path: f.path.clone(),
start_line,
end_line,
}
})
.collect::<Vec<_>>();
let mut block = DuplicateBlock {
token_count: u32::try_from(b.token_count).unwrap_or(u32::MAX),
locations,
};
block.locations.sort_by(|x, y| {
x.path
.cmp(&y.path)
.then_with(|| x.start_line.cmp(&y.start_line))
});
block
})
.collect();
report_blocks.sort_by(|a, b| {
b.token_count
.cmp(&a.token_count)
.then_with(|| a.locations[0].path.cmp(&b.locations[0].path))
.then_with(|| a.locations[0].start_line.cmp(&b.locations[0].start_line))
});
report_blocks
}
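/// Aggregates block-level results per file. Returns the per-file summaries
/// (sorted by path), the number of files containing at least one duplicate,
/// and the total number of duplicated tokens summed over every site.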
fn file_level_rollup(
files: &[FileTokens],
blocks: &[DuplicateBlock],
) -> (Vec<FileDuplication>, usize, usize) {
    let mut dup_tokens_by_file: BTreeMap<PathBuf, u32> = BTreeMap::new();
let mut duplicate_tokens: usize = 0;
for b in blocks {
duplicate_tokens += b.token_count as usize * b.locations.len();
for loc in &b.locations {
let entry = dup_tokens_by_file.entry(loc.path.clone()).or_insert(0);
*entry = entry.saturating_add(b.token_count);
}
}
let mut summary: Vec<FileDuplication> = files
.iter()
.map(|f| {
let total_tokens = u32::try_from(f.hashes.len()).unwrap_or(u32::MAX);
let dt = dup_tokens_by_file
.get(&f.path)
.copied()
.unwrap_or(0)
.min(total_tokens);
let pct = if total_tokens == 0 {
0.0
} else {
f64::from(dt) / f64::from(total_tokens) * 100.0
};
FileDuplication {
path: f.path.clone(),
total_tokens,
duplicate_tokens: dt,
duplicate_pct: pct,
}
})
.collect();
summary.sort_by(|a, b| a.path.cmp(&b.path));
(summary, dup_tokens_by_file.len(), duplicate_tokens)
}
/// Result of a duplication scan.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
pub struct DuplicationReport {
    /// Duplicate blocks, ordered from largest to smallest token count.
    pub blocks: Vec<DuplicateBlock>,
    /// Per-file rollup, ordered by path.
    #[serde(default)]
    pub files: Vec<FileDuplication>,
    /// Scan-wide counters.
    pub totals: DuplicationTotals,
    /// Window size (minimum token run) used for the scan.
    pub min_tokens: u32,
}
impl Eq for DuplicationReport {}
impl DuplicationReport {
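    /// Returns the `n` largest duplicate blocks. Relies on `blocks` already
    /// being sorted from largest to smallest token count.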
#[must_use]
pub fn worst_n_blocks(&self, n: usize) -> Vec<DuplicateBlock> {
        self.blocks.iter().take(n).cloned().collect()
}
}
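/// Aggregate counters for the whole scan.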
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct DuplicationTotals {
pub duplicate_blocks: usize,
pub duplicate_tokens: usize,
pub files_affected: usize,
}
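/// A run of identical tokens found at two or more locations.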
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct DuplicateBlock {
pub token_count: u32,
pub locations: Vec<DuplicateLocation>,
}
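/// One occurrence of a duplicate block, as a path and 1-based line range.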
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct DuplicateLocation {
pub path: PathBuf,
pub start_line: u32,
pub end_line: u32,
}
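/// Per-file duplication rollup.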
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FileDuplication {
pub path: PathBuf,
pub total_tokens: u32,
pub duplicate_tokens: u32,
pub duplicate_pct: f64,
}
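// Hand-written because `duplicate_pct: f64` blocks `derive(Eq)`; the rollup
// guards against division by zero, so the field is never NaN.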
impl Eq for FileDuplication {}
impl Observer for DuplicationObserver {
type Output = DuplicationReport;
fn meta(&self) -> ObservationMeta {
ObservationMeta {
name: "duplication",
version: 1,
}
}
fn observe(&self, project_root: &Path) -> anyhow::Result<Self::Output> {
Ok(self.scan(project_root))
}
}
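// Each duplicate block lowers to one finding anchored at its first
// (path-sorted) location, with the remaining sites attached as extra
// locations; the token count and every site are folded into the seed passed
// to `Finding::new`.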
impl IntoFindings for DuplicationReport {
fn into_findings(&self) -> Vec<Finding> {
self.blocks
.iter()
.map(|block| {
let (head, rest) = block
.locations
.split_first()
.expect("DuplicateBlock always has ≥2 locations");
let primary = Location {
file: head.path.clone(),
line: Some(head.start_line),
symbol: None,
};
let mut seed = format!("dup:{}:", block.token_count);
let _ = write!(seed, "{}:{};", head.path.to_string_lossy(), head.start_line);
let mut extras: Vec<Location> = Vec::with_capacity(rest.len());
for l in rest {
let _ = write!(seed, "{}:{};", l.path.to_string_lossy(), l.start_line);
extras.push(Location {
file: l.path.clone(),
line: Some(l.start_line),
symbol: None,
});
}
let summary = format!(
"{} tokens duplicated across {} sites",
block.token_count,
1 + extras.len()
);
Finding::new("duplication", primary, summary, &seed).with_locations(extras)
})
.collect()
}
}
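/// Tokenized view of one file: parallel vectors of token hashes and their
/// 1-based start lines, plus the path (relative to the scan root when
/// possible).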
struct FileTokens {
path: PathBuf,
hashes: Vec<u64>,
lines: Vec<u32>,
}
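/// A detected duplicate before report assembly; `locations` holds
/// (file index, token start offset) pairs.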
struct InternalBlock {
token_count: usize,
locations: Vec<(usize, usize)>,
}
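/// Walks the parse tree in pre-order without recursion, collecting a hash and
/// start line for every countable leaf token.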
fn collect_tokens(parsed: &ParsedFile) -> (Vec<u64>, Vec<u32>) {
let mut hashes: Vec<u64> = Vec::new();
let mut lines: Vec<u32> = Vec::new();
let source = parsed.source.as_bytes();
let mut cursor: TreeCursor<'_> = parsed.tree.walk();
loop {
let node = cursor.node();
if let Some((hash, line)) = leaf_token(&node, source) {
hashes.push(hash);
lines.push(line);
}
if cursor.goto_first_child() {
continue;
}
loop {
if cursor.goto_next_sibling() {
break;
}
if !cursor.goto_parent() {
return (hashes, lines);
}
}
}
}
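/// Hashes a single leaf node with FNV-1a over its kind id and text. Nodes
/// with children, extra/error/missing nodes, and whitespace-only text are
/// skipped. Returns the hash and the 1-based start line.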
fn leaf_token(node: &tree_sitter::Node<'_>, source: &[u8]) -> Option<(u64, u32)> {
if node.child_count() != 0 || node.is_extra() || node.is_error() || node.is_missing() {
return None;
}
let text = node.utf8_text(source).ok()?;
if text.trim().is_empty() {
return None;
}
let mut h = FNV_OFFSET;
for b in node.kind_id().to_le_bytes() {
h = (h ^ u64::from(b)).wrapping_mul(HASH_BASE);
}
for b in text.as_bytes() {
h = (h ^ u64::from(*b)).wrapping_mul(HASH_BASE);
}
let line = u32::try_from(node.start_position().row + 1).unwrap_or(u32::MAX);
Some((h, line))
}
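/// Computes a Rabin-Karp polynomial hash for every `window`-sized slice of
/// `tokens`, using wrapping u64 arithmetic. The first window is hashed
/// directly; each later hash is derived in O(1) via
/// `h' = (h - t_old * B^(w-1)) * B + t_new`, with `B = HASH_BASE`.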
fn compute_window_hashes(tokens: &[u64], window: usize) -> Vec<u64> {
if tokens.len() < window || window == 0 {
return Vec::new();
}
let exp = u32::try_from(window).unwrap_or(u32::MAX).saturating_sub(1);
let base_pow = HASH_BASE.wrapping_pow(exp);
let mut out = Vec::with_capacity(tokens.len() - window + 1);
let mut h: u64 = 0;
for &t in &tokens[..window] {
h = h.wrapping_mul(HASH_BASE).wrapping_add(t);
}
out.push(h);
for k in window..tokens.len() {
let oldest = tokens[k - window];
h = h
.wrapping_sub(oldest.wrapping_mul(base_pow))
.wrapping_mul(HASH_BASE)
.wrapping_add(tokens[k]);
out.push(h);
}
out
}
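/// Finds duplicated token runs across files. All window hashes are bucketed;
/// for each uncovered window start whose bucket holds two or more candidates,
/// the candidates are verified token-by-token (excluding hash collisions),
/// the match is greedily extended while every site agrees on the next token,
/// and all window starts inside the block are marked as covered so
/// overlapping windows of the same block are not reported again.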
fn detect_blocks(files: &[FileTokens], window: usize) -> Vec<InternalBlock> {
if window == 0 {
return Vec::new();
}
let window_hashes: Vec<Vec<u64>> = files
.iter()
.map(|f| compute_window_hashes(&f.hashes, window))
.collect();
let mut buckets: HashMap<u64, Vec<(usize, usize)>> = HashMap::new();
for (fi, hashes) in window_hashes.iter().enumerate() {
for (start, &h) in hashes.iter().enumerate() {
buckets.entry(h).or_default().push((fi, start));
}
}
let mut covered: Vec<Vec<bool>> = files
.iter()
.map(|f| vec![false; f.hashes.len().max(1)])
.collect();
let mut blocks: Vec<InternalBlock> = Vec::new();
for fi in 0..files.len() {
let n = files[fi].hashes.len();
if n < window {
continue;
}
let last_start = n - window;
for start in 0..=last_start {
if covered[fi][start] {
continue;
}
let h = window_hashes[fi][start];
let Some(bucket) = buckets.get(&h) else {
continue;
};
if bucket.len() < 2 {
continue;
}
let base_window = &files[fi].hashes[start..start + window];
let mut locs: Vec<(usize, usize)> = bucket
.iter()
.copied()
.filter(|(bf, bs)| {
if covered[*bf][*bs] {
return false;
}
let other = &files[*bf];
other.hashes.len() >= bs + window
&& &other.hashes[*bs..*bs + window] == base_window
})
.collect();
locs.sort_unstable();
locs.dedup();
if locs.len() < 2 {
continue;
}
            // Greedily extend the match while every site agrees on the next token.
            let mut len = window;
            loop {
                let (rf, rs) = locs[0];
                let Some(reference) = files[rf].hashes.get(rs + len).copied() else {
                    break;
                };
                let all_match = locs
                    .iter()
                    .skip(1)
                    .all(|&(lf, ls)| files[lf].hashes.get(ls + len).copied() == Some(reference));
                if !all_match {
                    break;
                }
                len += 1;
            }
let cover_count = len - window + 1;
for &(lf, ls) in &locs {
for c in ls..(ls + cover_count) {
if let Some(slot) = covered[lf].get_mut(c) {
*slot = true;
}
}
}
blocks.push(InternalBlock {
token_count: len,
locations: locs,
});
}
}
blocks
}
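/// Registers duplication as an observer-backed feature and lowers its report
/// into findings.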
pub struct DuplicationFeature;
impl Feature for DuplicationFeature {
fn meta(&self) -> FeatureMeta {
FeatureMeta {
name: "duplication",
version: 1,
kind: FeatureKind::Observer,
}
}
fn enabled(&self, cfg: &Config) -> bool {
cfg.metrics.duplication.enabled
}
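    /// Grades each duplicate block by the worst per-file duplication
    /// percentage among its locations, using calibrated thresholds when
    /// available, then decorates the finding with hotspot context.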
fn lower(
&self,
reports: &ObserverReports,
_cfg: &Config,
cal: &crate::core::calibration::Calibration,
hotspot: &HotspotIndex,
) -> Vec<Finding> {
let Some(dup) = reports.duplication.as_ref() else {
return Vec::new();
};
let cal_dup = cal.calibration.duplication.as_ref();
let pct_by_file: HashMap<&Path, f64> = dup
.files
.iter()
.map(|f| (f.path.as_path(), f.duplicate_pct))
.collect();
let mut out = Vec::with_capacity(dup.blocks.len());
for (block, finding) in dup.blocks.iter().zip(dup.into_findings()) {
let max_pct = block
.locations
.iter()
.filter_map(|l| pct_by_file.get(l.path.as_path()).copied())
.fold(0.0_f64, f64::max);
let severity = cal_dup.map_or(Severity::Ok, |c| c.classify(max_pct));
out.push(decorate(finding, severity, hotspot));
}
out
}
}
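// Two minimal sanity-check sketches (not part of the upstream test suite;
// helper and test names are illustrative): the first asserts the rolling
// window hash agrees with a direct recomputation, the invariant
// `detect_blocks` relies on; the second feeds `detect_blocks` two synthetic
// token streams and checks that the shared run is reported once.
#[cfg(test)]
mod tests {
    use super::*;

    /// Reference implementation: direct polynomial hash of one window.
    fn direct_hash(tokens: &[u64]) -> u64 {
        tokens
            .iter()
            .fold(0u64, |h, &t| h.wrapping_mul(HASH_BASE).wrapping_add(t))
    }

    #[test]
    fn rolling_hash_matches_direct_hash() {
        // Arbitrary, deterministic token stream.
        let tokens: Vec<u64> = (0..64u64)
            .map(|i| i.wrapping_mul(0x9e37_79b9).wrapping_add(7))
            .collect();
        for window in [1usize, 2, 5, 64] {
            let rolled = compute_window_hashes(&tokens, window);
            assert_eq!(rolled.len(), tokens.len() - window + 1);
            for (i, &h) in rolled.iter().enumerate() {
                assert_eq!(h, direct_hash(&tokens[i..i + window]));
            }
        }
    }

    #[test]
    fn detects_shared_run_across_files() {
        // Two synthetic files sharing a 5-token run at offsets 0 and 2.
        let shared: Vec<u64> = vec![10, 11, 12, 13, 14];
        let a = FileTokens {
            path: PathBuf::from("a.rs"),
            hashes: shared.clone(),
            lines: vec![1, 2, 3, 4, 5],
        };
        let mut b_hashes = vec![90, 91];
        b_hashes.extend_from_slice(&shared);
        let b = FileTokens {
            path: PathBuf::from("b.rs"),
            hashes: b_hashes,
            lines: vec![1, 1, 2, 3, 4, 5, 6],
        };
        let blocks = detect_blocks(&[a, b], 5);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].token_count, 5);
        assert_eq!(blocks[0].locations, vec![(0, 0), (1, 2)]);
    }
}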