use crate::detectors::base::{Detector, DetectorConfig};
use crate::graph::GraphQueryExt;
use crate::models::{Finding, Severity};
use anyhow::Result;
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use tracing::{debug, info};
#[derive(Debug, Clone)]
pub struct DataClumpsThresholds {
pub min_params: usize,
pub min_occurrences: usize,
}
impl Default for DataClumpsThresholds {
fn default() -> Self {
Self {
min_params: 4,
min_occurrences: 5,
}
}
}
const FRAMEWORK_CONVENTIONS: &[&[&str]] = &[
&["req", "res", "next"],
&["ctx", "w", "r"],
&["self", "other"],
&["request", "response"],
&["app", "req", "res"],
&["t", "ctx"],
&["db", "ctx"],
&["conn", "ctx"],
];
fn suggest_struct_name(params: &[String]) -> String {
let param_set: HashSet<&str> = params.iter().map(|s| s.as_str()).collect();
let patterns: &[(&[&str], &str)] = &[
(&["x", "y"], "Point"),
(&["x", "y", "z"], "Point3D"),
(&["width", "height"], "Size"),
(&["start", "end"], "Range"),
(&["min", "max"], "Range"),
(&["host", "port"], "Address"),
(&["first_name", "last_name"], "Name"),
(&["first_name", "last_name", "email"], "PersonInfo"),
(&["latitude", "longitude"], "Coordinates"),
(&["lat", "lng"], "Coordinates"),
(&["red", "green", "blue"], "RGB"),
(&["r", "g", "b"], "RGB"),
(&["username", "password"], "Credentials"),
(&["user", "password"], "Credentials"),
(&["path", "filename"], "FilePath"),
(&["name", "email"], "Contact"),
(&["start_date", "end_date"], "DateRange"),
(&["created_at", "updated_at"], "Timestamps"),
];
for (pattern_params, name) in patterns {
let pattern_set: HashSet<&str> = pattern_params.iter().copied().collect();
if pattern_set.is_subset(¶m_set) {
return name.to_string();
}
}
if let Some(first) = params.first() {
let base: String = first
.split('_')
.map(|w| {
let mut c = w.chars();
match c.next() {
None => String::new(),
Some(f) => f.to_uppercase().chain(c).collect(),
}
})
.collect();
return format!("{}Params", base);
}
"ParamGroup".to_string()
}
pub struct DataClumpsDetector {
config: DetectorConfig,
thresholds: DataClumpsThresholds,
}
impl DataClumpsDetector {
pub fn new() -> Self {
Self {
config: DetectorConfig::new(),
thresholds: DataClumpsThresholds::default(),
}
}
pub fn with_config(config: DetectorConfig) -> Self {
let thresholds = DataClumpsThresholds {
min_params: config.get_option_or("min_params", 4),
min_occurrences: config.get_option_or("min_occurrences", 5),
};
Self { config, thresholds }
}
fn is_framework_convention(params: &[String]) -> bool {
let param_set: HashSet<&str> = params.iter().map(|s| s.as_str()).collect();
for pattern in FRAMEWORK_CONVENTIONS {
let pattern_set: HashSet<&str> = pattern.iter().copied().collect();
if param_set.is_subset(&pattern_set) {
return true;
}
}
false
}
fn extract_params(
&self,
func: &crate::graph::CodeNode,
graph: &dyn crate::graph::GraphQuery,
) -> Vec<String> {
let i = graph.interner();
if let Some(params_key) = graph
.extra_props(func.qualified_name)
.and_then(|ep| ep.params)
{
let params_str = i.resolve(params_key);
return params_str
.split(',')
.map(|p| p.trim().to_lowercase())
.filter(|p| !p.is_empty() && !p.starts_with('_') && p != "self" && p != "this")
.collect();
}
if func.param_count > 0 && (func.param_count as usize) >= self.thresholds.min_params {
}
vec![]
}
fn find_clumps(&self, graph: &dyn crate::graph::GraphQuery) -> Vec<DataClump> {
let i = graph.interner();
let functions = graph.get_functions_shared();
let mut func_data: Vec<(FuncInfo, HashSet<String>)> = Vec::new();
for func in functions.iter() {
let params = self.extract_params(func, graph);
if params.len() < self.thresholds.min_params {
continue;
}
func_data.push((
FuncInfo {
name: func.node_name(i).to_string(),
qualified_name: func.qn(i).to_string(),
file: func.path(i).to_string(),
line: func.line_start,
},
params.into_iter().collect(),
));
}
debug!(
"DataClumps: {} functions with {}+ params",
func_data.len(),
self.thresholds.min_params
);
if func_data.len() < self.thresholds.min_occurrences {
return vec![];
}
let mut param_index: HashMap<&str, Vec<usize>> = HashMap::new();
for (idx, (_, params)) in func_data.iter().enumerate() {
for param in params {
param_index.entry(param.as_str()).or_default().push(idx);
}
}
let mut seen_pairs: HashSet<(usize, usize)> = HashSet::new();
let mut clump_map: HashMap<Vec<String>, HashSet<usize>> = HashMap::new();
for indices in param_index.values() {
if indices.len() < 2 {
continue;
}
for (i_pos, &i) in indices.iter().enumerate() {
for &j in &indices[i_pos + 1..] {
let key = if i < j { (i, j) } else { (j, i) };
if !seen_pairs.insert(key) {
continue;
}
let mut shared: Vec<String> = func_data[key.0]
.1
.intersection(&func_data[key.1].1)
.cloned()
.collect();
if shared.len() >= self.thresholds.min_params {
shared.sort();
let entry = clump_map.entry(shared).or_default();
entry.insert(key.0);
entry.insert(key.1);
}
}
}
}
let funcs_in_clumps: HashSet<usize> = clump_map
.values()
.flat_map(|indices| indices.iter().copied())
.collect();
let callees_map: HashMap<&str, HashSet<String>> = funcs_in_clumps
.iter()
.map(|&idx| {
let qn = func_data[idx].0.qualified_name.as_str();
let callees: HashSet<String> = graph
.get_callees(qn)
.iter()
.map(|c| c.qn(i).to_string())
.collect();
(qn, callees)
})
.collect();
let mut clumps: Vec<DataClump> = clump_map
.into_iter()
.filter(|(params, func_indices)| {
func_indices.len() >= self.thresholds.min_occurrences
&& !Self::is_framework_convention(params)
})
.map(|(params, func_indices)| {
let funcs: Vec<FuncInfo> = func_indices
.iter()
.map(|&idx| func_data[idx].0.clone())
.collect();
let (call_count, is_chain) =
self.analyze_call_relationships_cached(&callees_map, &funcs);
DataClump {
params,
funcs,
call_relationships: call_count,
is_call_chain: is_chain,
}
})
.collect();
clumps = self.remove_subsets(clumps);
clumps.sort_by(|a, b| {
b.call_relationships
.cmp(&a.call_relationships)
.then(b.funcs.len().cmp(&a.funcs.len()))
});
clumps
}
fn analyze_call_relationships_cached(
&self,
callees_map: &HashMap<&str, HashSet<String>>,
funcs: &[FuncInfo],
) -> (usize, bool) {
let func_qns: HashSet<&str> = funcs.iter().map(|f| f.qualified_name.as_str()).collect();
let mut call_count = 0;
let mut has_chain = false;
for func in funcs {
if let Some(callees) = callees_map.get(func.qualified_name.as_str()) {
for callee_qn in callees {
if func_qns.contains(callee_qn.as_str()) {
call_count += 1;
if let Some(callee_callees) = callees_map.get(callee_qn.as_str()) {
for cc_qn in callee_callees {
if func_qns.contains(cc_qn.as_str())
&& *cc_qn != func.qualified_name
{
has_chain = true;
}
}
}
}
}
}
}
(call_count, has_chain)
}
fn remove_subsets(&self, mut clumps: Vec<DataClump>) -> Vec<DataClump> {
clumps.sort_by_key(|c| std::cmp::Reverse(c.params.len()));
let mut result: Vec<DataClump> = Vec::new();
for clump in clumps {
let param_set: HashSet<&String> = clump.params.iter().collect();
let is_subset = result.iter().any(|existing: &DataClump| {
let existing_params: HashSet<&String> = existing.params.iter().collect();
param_set.len() < existing_params.len()
&& param_set.is_subset(&existing_params)
&& existing.funcs.len() >= clump.funcs.len()
});
if !is_subset {
result.push(clump);
}
}
result
}
fn calculate_severity(&self, clump: &DataClump) -> Severity {
let func_count = clump.funcs.len();
let call_rels = clump.call_relationships;
let base = if func_count >= 6 {
Severity::High
} else if func_count >= 4 {
Severity::Medium
} else {
Severity::Low
};
if clump.is_call_chain {
return match base {
Severity::Low => Severity::Medium,
Severity::Medium => Severity::High,
_ => Severity::High,
};
}
if call_rels >= 3 {
return match base {
Severity::Low => Severity::Medium,
_ => base,
};
}
base
}
}
struct DataClump {
params: Vec<String>,
funcs: Vec<FuncInfo>,
call_relationships: usize,
is_call_chain: bool,
}
#[derive(Clone)]
struct FuncInfo {
name: String,
qualified_name: String,
file: String,
line: u32,
}
#[cfg(test)]
fn combinations(items: &[String], k: usize) -> Vec<Vec<String>> {
if k > items.len() {
return vec![];
}
if k == items.len() {
return vec![items.to_vec()];
}
if k == 0 {
return vec![vec![]];
}
let mut result = Vec::new();
let mut indices: Vec<usize> = (0..k).collect();
let n = items.len();
loop {
result.push(indices.iter().map(|&i| items[i].clone()).collect());
let mut i = k;
while i > 0 {
i -= 1;
if indices[i] < n - k + i {
break;
}
}
if i == 0 && indices[0] >= n - k {
break;
}
indices[i] += 1;
for j in (i + 1)..k {
indices[j] = indices[j - 1] + 1;
}
}
result
}
impl Default for DataClumpsDetector {
fn default() -> Self {
Self::new()
}
}
impl Detector for DataClumpsDetector {
fn name(&self) -> &'static str {
"DataClumpsDetector"
}
fn description(&self) -> &'static str {
"Detects parameter groups that appear together across functions"
}
fn category(&self) -> &'static str {
"code_smell"
}
fn config(&self) -> Option<&DetectorConfig> {
Some(&self.config)
}
fn detect(
&self,
ctx: &crate::detectors::analysis_context::AnalysisContext,
) -> Result<Vec<Finding>> {
let graph = ctx.graph;
let mut findings = Vec::new();
let clumps = self.find_clumps(graph);
for clump in clumps {
let severity = self.calculate_severity(&clump);
let struct_name = suggest_struct_name(&clump.params);
let params_str = clump.params.join(", ");
let func_list: String = clump
.funcs
.iter()
.take(5)
.map(|f| format!(" - {} ({}:{})", f.name, f.file, f.line))
.collect::<Vec<_>>()
.join("\n");
let more_note = if clump.funcs.len() > 5 {
format!("\n ... and {} more functions", clump.funcs.len() - 5)
} else {
String::new()
};
let call_info = if clump.is_call_chain {
"\n\n⚠️ **Call chain detected**: These parameters travel through a call chain, \
making refactoring especially valuable."
.to_string()
} else if clump.call_relationships > 0 {
format!(
"\n\n📞 **{} call relationships** found between these functions.",
clump.call_relationships
)
} else {
String::new()
};
let mut files: Vec<PathBuf> = clump
.funcs
.iter()
.map(|f| PathBuf::from(&f.file))
.collect::<HashSet<_>>()
.into_iter()
.collect();
files.sort();
findings.push(Finding {
id: String::new(),
detector: "DataClumpsDetector".to_string(),
severity,
title: format!("Data clump: ({})", params_str),
description: format!(
"Parameters **({})** appear together in **{} functions**.\n\n\
This suggests a missing abstraction - consider extracting a `{}` struct.\n\n\
**Affected functions:**\n{}{}{}",
params_str,
clump.funcs.len(),
struct_name,
func_list,
more_note,
call_info
),
affected_files: files,
line_start: clump.funcs.first().map(|f| f.line),
line_end: None,
suggested_fix: Some(format!(
"Extract parameters into a struct:\n\n\
```rust\n\
struct {} {{\n\
{}\n\
}}\n\
```\n\n\
Then refactor functions to accept `{}` instead of {} separate parameters.{}",
struct_name,
clump.params.iter().map(|p| format!(" {}: Type,", p)).collect::<Vec<_>>().join("\n"),
struct_name,
clump.params.len(),
if clump.is_call_chain {
"\n\nSince these functions call each other, the refactoring can be done incrementally."
} else { "" }
)),
estimated_effort: Some(if clump.funcs.len() >= 6 || clump.is_call_chain {
"Large (2-4 hours)".to_string()
} else {
"Medium (1-2 hours)".to_string()
}),
category: Some("code_smell".to_string()),
cwe_id: None,
why_it_matters: Some(
"Data clumps indicate missing abstractions. Grouping related parameters \
into a struct improves readability, type safety, and makes changes easier."
.to_string()
),
..Default::default()
});
}
info!(
"DataClumpsDetector found {} findings (graph-aware)",
findings.len()
);
Ok(findings)
}
}
impl crate::detectors::RegisteredDetector for DataClumpsDetector {
fn create(init: &crate::detectors::DetectorInit) -> std::sync::Arc<dyn Detector> {
std::sync::Arc::new(Self::with_config(init.config_for("DataClumpsDetector")))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_suggest_struct_name() {
assert_eq!(
suggest_struct_name(&["x".to_string(), "y".to_string()]),
"Point"
);
assert_eq!(
suggest_struct_name(&["host".to_string(), "port".to_string()]),
"Address"
);
assert_eq!(
suggest_struct_name(&["foo".to_string(), "bar".to_string(), "baz".to_string()]),
"FooParams"
);
}
#[test]
fn test_combinations() {
let items = vec!["a".to_string(), "b".to_string(), "c".to_string()];
let combos = combinations(&items, 2);
assert_eq!(combos.len(), 3); }
#[test]
fn test_severity() {
let detector = DataClumpsDetector::new();
let clump_3 = DataClump {
params: vec!["a".to_string(), "b".to_string(), "c".to_string()],
funcs: vec![
FuncInfo {
name: "f1".to_string(),
qualified_name: "mod::f1".to_string(),
file: "a.rs".to_string(),
line: 1,
},
FuncInfo {
name: "f2".to_string(),
qualified_name: "mod::f2".to_string(),
file: "a.rs".to_string(),
line: 10,
},
FuncInfo {
name: "f3".to_string(),
qualified_name: "mod::f3".to_string(),
file: "a.rs".to_string(),
line: 20,
},
],
call_relationships: 0,
is_call_chain: false,
};
assert_eq!(detector.calculate_severity(&clump_3), Severity::Low);
let clump_4 = DataClump {
params: vec!["a".to_string(), "b".to_string(), "c".to_string()],
funcs: vec![
FuncInfo {
name: "f1".to_string(),
qualified_name: "mod::f1".to_string(),
file: "a.rs".to_string(),
line: 1,
},
FuncInfo {
name: "f2".to_string(),
qualified_name: "mod::f2".to_string(),
file: "a.rs".to_string(),
line: 10,
},
FuncInfo {
name: "f3".to_string(),
qualified_name: "mod::f3".to_string(),
file: "a.rs".to_string(),
line: 20,
},
FuncInfo {
name: "f4".to_string(),
qualified_name: "mod::f4".to_string(),
file: "a.rs".to_string(),
line: 30,
},
],
call_relationships: 0,
is_call_chain: false,
};
assert_eq!(detector.calculate_severity(&clump_4), Severity::Medium);
let clump_6 = DataClump {
params: vec!["a".to_string(), "b".to_string(), "c".to_string()],
funcs: vec![
FuncInfo {
name: "f1".to_string(),
qualified_name: "mod::f1".to_string(),
file: "a.rs".to_string(),
line: 1,
},
FuncInfo {
name: "f2".to_string(),
qualified_name: "mod::f2".to_string(),
file: "a.rs".to_string(),
line: 10,
},
FuncInfo {
name: "f3".to_string(),
qualified_name: "mod::f3".to_string(),
file: "a.rs".to_string(),
line: 20,
},
FuncInfo {
name: "f4".to_string(),
qualified_name: "mod::f4".to_string(),
file: "a.rs".to_string(),
line: 30,
},
FuncInfo {
name: "f5".to_string(),
qualified_name: "mod::f5".to_string(),
file: "a.rs".to_string(),
line: 40,
},
FuncInfo {
name: "f6".to_string(),
qualified_name: "mod::f6".to_string(),
file: "a.rs".to_string(),
line: 50,
},
],
call_relationships: 0,
is_call_chain: false,
};
assert_eq!(detector.calculate_severity(&clump_6), Severity::High);
let clump_chain = DataClump {
params: vec!["a".to_string(), "b".to_string(), "c".to_string()],
funcs: vec![
FuncInfo {
name: "f1".to_string(),
qualified_name: "mod::f1".to_string(),
file: "a.rs".to_string(),
line: 1,
},
FuncInfo {
name: "f2".to_string(),
qualified_name: "mod::f2".to_string(),
file: "a.rs".to_string(),
line: 10,
},
FuncInfo {
name: "f3".to_string(),
qualified_name: "mod::f3".to_string(),
file: "a.rs".to_string(),
line: 20,
},
],
call_relationships: 2,
is_call_chain: true, };
assert_eq!(detector.calculate_severity(&clump_chain), Severity::Medium);
}
#[test]
fn test_framework_convention_skipped() {
assert!(DataClumpsDetector::is_framework_convention(&[
"req".to_string(),
"res".to_string(),
"next".to_string(),
]));
assert!(DataClumpsDetector::is_framework_convention(&[
"ctx".to_string(),
"w".to_string(),
"r".to_string(),
]));
assert!(DataClumpsDetector::is_framework_convention(&[
"req".to_string(),
"res".to_string(),
]));
assert!(!DataClumpsDetector::is_framework_convention(&[
"user_id".to_string(),
"email".to_string(),
"name".to_string(),
]));
}
#[test]
fn test_subset_deduplication() {
let detector = DataClumpsDetector::new();
let make_funcs = |n: usize| -> Vec<FuncInfo> {
(1..=n)
.map(|i| FuncInfo {
name: format!("f{}", i),
qualified_name: format!("mod::f{}", i),
file: "a.rs".to_string(),
line: i as u32,
})
.collect()
};
let superset = DataClump {
params: vec![
"a".to_string(),
"b".to_string(),
"c".to_string(),
"d".to_string(),
],
funcs: make_funcs(5),
call_relationships: 0,
is_call_chain: false,
};
let subset = DataClump {
params: vec!["a".to_string(), "b".to_string(), "c".to_string()],
funcs: make_funcs(5),
call_relationships: 0,
is_call_chain: false,
};
let result = detector.remove_subsets(vec![superset, subset]);
assert_eq!(result.len(), 1);
assert_eq!(result[0].params.len(), 4, "only the superset should remain");
let subset_more_occ = DataClump {
params: vec!["a".to_string(), "b".to_string(), "c".to_string()],
funcs: make_funcs(8),
call_relationships: 0,
is_call_chain: false,
};
let superset2 = DataClump {
params: vec![
"a".to_string(),
"b".to_string(),
"c".to_string(),
"d".to_string(),
],
funcs: make_funcs(5),
call_relationships: 0,
is_call_chain: false,
};
let result2 = detector.remove_subsets(vec![superset2, subset_more_occ]);
assert_eq!(
result2.len(),
2,
"subset with more occurrences should be kept"
);
}
}