use super::{MergeRecord, ModuleSplit, Priority};
/// Thresholds that decide whether a proposed module split is kept, merged
/// into another split, or rebalanced.
#[derive(Clone, Debug)]
pub struct SplitSizeConfig {
    /// Method count at or above which a split is viable on its own.
    pub min_methods: usize,
    /// Estimated line count at or above which a split is viable on its own.
    pub min_lines: usize,
    /// Reduced method-count threshold for highly cohesive utility modules.
    pub utility_min_methods: usize,
    /// Cohesion score a utility module must exceed to use `utility_min_methods`.
    pub utility_cohesion_threshold: f64,
    /// Largest allowed ratio between the biggest and smallest split, by method count.
    pub max_size_ratio: f64,
    /// Splits with a known cohesion score below this value are discarded.
    pub min_cohesion_score: f64,
    /// Minimum semantic similarity required to merge an undersized split into another.
    pub min_merge_similarity: f64,
}
impl Default for SplitSizeConfig {
fn default() -> Self {
Self {
min_methods: 10,
min_lines: 150,
utility_min_methods: 5,
utility_cohesion_threshold: 0.7,
max_size_ratio: 2.0,
min_cohesion_score: 0.3,
min_merge_similarity: 0.4,
}
}
}
impl SplitSizeConfig {
    /// Returns `true` when `split` is large enough to stand as its own module.
    ///
    /// A split is viable when it has at least `min_methods` methods or at
    /// least `min_lines` estimated lines. A utility module (as decided by
    /// `is_utility_module`) whose cohesion score exceeds
    /// `utility_cohesion_threshold` additionally qualifies with only
    /// `utility_min_methods` methods. A missing cohesion score counts as 0.0.
    pub fn is_viable_split(&self, split: &ModuleSplit) -> bool {
        let cohesive_utility = is_utility_module(split)
            && split.cohesion_score.unwrap_or(0.0) > self.utility_cohesion_threshold;

        // The utility rule only relaxes the requirement: failing it must not
        // veto a split that still satisfies the general size thresholds.
        if cohesive_utility && split.method_count >= self.utility_min_methods {
            return true;
        }

        split.method_count >= self.min_methods || split.estimated_lines >= self.min_lines
    }

    /// Returns `true` when the cohesion score of `split` is at least
    /// `min_cohesion_score`, or when the split has no cohesion score at all.
    pub fn has_sufficient_cohesion(&self, split: &ModuleSplit) -> bool {
        split
            .cohesion_score
            .map_or(true, |score| score >= self.min_cohesion_score)
    }
}
/// Validates and refines `splits` using `SplitSizeConfig::default()`.
pub fn validate_and_refine_splits(splits: Vec<ModuleSplit>) -> Vec<ModuleSplit> {
    let default_config = SplitSizeConfig::default();
    validate_and_refine_splits_with_config(splits, &default_config)
}
pub fn validate_and_refine_splits_with_config(
splits: Vec<ModuleSplit>,
config: &SplitSizeConfig,
) -> Vec<ModuleSplit> {
if splits.is_empty() {
return splits;
}
let splits_with_cohesion = splits
.into_iter()
.map(calculate_split_cohesion)
.collect::<Vec<_>>();
let (mut viable, undersized): (Vec<_>, Vec<_>) = splits_with_cohesion
.into_iter()
.partition(|s| config.is_viable_split(s));
for undersized_split in undersized {
if let Some(merge_target_idx) = find_best_merge_target(&undersized_split, &viable, config) {
viable[merge_target_idx] = merge_splits(
viable[merge_target_idx].clone(),
undersized_split,
config.min_merge_similarity,
);
}
}
viable.retain(|s| config.has_sufficient_cohesion(s));
let balanced = ensure_balanced_distribution(viable, config);
balanced
.into_iter()
.map(|split| prioritize_by_size(split, config))
.collect()
}
/// Ensures `split` carries a cohesion score.
///
/// An existing score is left untouched. Otherwise a split with fewer than two
/// methods gets 1.0 and anything larger is scored by
/// `calculate_naming_cohesion`.
fn calculate_split_cohesion(mut split: ModuleSplit) -> ModuleSplit {
    if split.cohesion_score.is_none() {
        let score = match split.methods_to_move.len() {
            0 | 1 => 1.0,
            _ => calculate_naming_cohesion(&split.methods_to_move),
        };
        split.cohesion_score = Some(score);
    }
    split
}
/// Scores how strongly `methods` share a naming prefix.
///
/// Returns 1.0 for fewer than two methods and 0.5 when no prefix could be
/// extracted. Otherwise the score is
/// `0.4 + 0.6 * (1 - distinct_prefixes / methods.len())`.
fn calculate_naming_cohesion(methods: &[String]) -> f64 {
    let total = methods.len();
    if total < 2 {
        return 1.0;
    }

    // Only the number of distinct prefixes matters, so collect straight into a set.
    let distinct: std::collections::HashSet<String> = methods
        .iter()
        .filter_map(|name| extract_method_prefix(name))
        .collect();
    if distinct.is_empty() {
        return 0.5;
    }

    let sharing_ratio = 1.0 - (distinct.len() as f64 / total as f64);
    0.4 + (sharing_ratio * 0.6)
}
/// Extracts the lowercase leading word of a method name.
///
/// Leading underscores are stripped first, so `_parse_input` and
/// `__init__` yield `parse` and `init` rather than an empty prefix. The
/// prefix is then the text before the first remaining `_` (snake_case), or
/// before the first uppercase character after the first one (camelCase), or
/// the whole name if neither boundary exists.
///
/// Always returns `Some`; the `Option` return type is kept for callers.
fn extract_method_prefix(method: &str) -> Option<String> {
    // A leading underscore marks visibility, not the first word of the name.
    let name = method.trim_start_matches('_');

    if let Some((head, _)) = name.split_once('_') {
        return Some(head.to_lowercase());
    }

    // camelCase: cut at the first uppercase character that is not the first character.
    let boundary = name
        .char_indices()
        .skip(1)
        .find(|(_, c)| c.is_uppercase())
        .map(|(idx, _)| idx);

    let prefix = match boundary {
        Some(idx) => &name[..idx],
        None => name,
    };
    Some(prefix.to_lowercase())
}
/// Picks the viable split that `undersized` should be merged into.
///
/// Returns the index into `viable_splits` of the candidate with the highest
/// semantic similarity, considering only candidates whose similarity is at
/// least `config.min_merge_similarity`. Returns `None` when no candidate
/// qualifies, which includes an empty `viable_splits`.
fn find_best_merge_target(
    undersized: &ModuleSplit,
    viable_splits: &[ModuleSplit],
    config: &SplitSizeConfig,
) -> Option<usize> {
    viable_splits
        .iter()
        .enumerate()
        .filter_map(|(idx, candidate)| {
            let score = calculate_semantic_similarity(undersized, candidate);
            if score >= config.min_merge_similarity {
                Some((idx, score))
            } else {
                None
            }
        })
        .max_by(|left, right| {
            left.1
                .partial_cmp(&right.1)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .map(|(idx, _)| idx)
}
/// Combines naming, responsibility and domain similarity into one score.
///
/// The three components are weighted 0.3, 0.4 and 0.3 respectively.
fn calculate_semantic_similarity(split1: &ModuleSplit, split2: &ModuleSplit) -> f64 {
    const NAMING_WEIGHT: f64 = 0.3;
    const RESPONSIBILITY_WEIGHT: f64 = 0.4;
    const DOMAIN_WEIGHT: f64 = 0.3;

    let naming = method_naming_similarity(split1, split2);
    let responsibility = responsibility_similarity(split1, split2);
    let domain = domain_similarity(split1, split2);

    NAMING_WEIGHT * naming + RESPONSIBILITY_WEIGHT * responsibility + DOMAIN_WEIGHT * domain
}
/// Jaccard similarity of the two splits' method-name prefix sets.
///
/// Returns 0.0 when either split yields no prefixes; otherwise the number of
/// shared prefixes divided by the number of distinct prefixes overall.
fn method_naming_similarity(split1: &ModuleSplit, split2: &ModuleSplit) -> f64 {
    use std::collections::HashSet;

    // Distinct prefixes of one split's methods.
    let prefix_set = |split: &ModuleSplit| -> HashSet<String> {
        split
            .methods_to_move
            .iter()
            .filter_map(|method| extract_method_prefix(method))
            .collect()
    };

    let left = prefix_set(split1);
    let right = prefix_set(split2);
    if left.is_empty() || right.is_empty() {
        return 0.0;
    }

    let shared = left.intersection(&right).count();
    let combined = left.union(&right).count();
    if combined == 0 {
        return 0.0;
    }
    shared as f64 / combined as f64
}
/// Compares the responsibility labels of two splits, ignoring case.
///
/// Returns 1.0 for equal labels, 0.7 when one label contains the other, and
/// 0.0 otherwise.
// NOTE(review): an empty label is a substring of every label, so an empty
// responsibility scores 0.7 against any non-empty one — confirm this is intended.
fn responsibility_similarity(split1: &ModuleSplit, split2: &ModuleSplit) -> f64 {
    let left = split1.responsibility.to_lowercase();
    let right = split2.responsibility.to_lowercase();

    if left == right {
        return 1.0;
    }

    let one_contains_other = left.contains(&right) || right.contains(&left);
    if one_contains_other {
        0.7
    } else {
        0.0
    }
}
/// Compares the domains of two splits, ignoring case.
///
/// Returns 0.5 when either domain is empty, 1.0 for equal domains, 0.6 when
/// one domain contains the other, and 0.0 otherwise.
fn domain_similarity(split1: &ModuleSplit, split2: &ModuleSplit) -> f64 {
    let either_missing = split1.domain.is_empty() || split2.domain.is_empty();
    if either_missing {
        return 0.5;
    }

    let left = split1.domain.to_lowercase();
    let right = split2.domain.to_lowercase();

    if left == right {
        return 1.0;
    }
    if left.contains(&right) || right.contains(&left) {
        return 0.6;
    }
    0.0
}
fn merge_splits(
mut target: ModuleSplit,
source: ModuleSplit,
similarity_score: f64,
) -> ModuleSplit {
target
.methods_to_move
.extend(source.methods_to_move.clone());
target.structs_to_move.extend(source.structs_to_move);
target.method_count += source.method_count;
target.estimated_lines += source.estimated_lines;
target.merge_history.push(MergeRecord {
merged_from: source.suggested_name.clone(),
reason: format!(
"Merged {} ({} methods) due to size constraint",
source.suggested_name, source.method_count
),
similarity_score,
});
if !source.responsibility.is_empty() && target.responsibility != source.responsibility {
target.responsibility = format!("{} & {}", target.responsibility, source.responsibility);
}
target.cohesion_score = None;
calculate_split_cohesion(target)
}
/// Reports whether split sizes are within `config.max_size_ratio` of each other.
///
/// Sizes are method counts. The distribution is balanced when the largest
/// count divided by the smallest does not exceed the ratio. A distribution
/// containing a split with zero methods is treated as balanced.
fn is_distribution_balanced(splits: &[ModuleSplit], config: &SplitSizeConfig) -> bool {
    let method_counts = || splits.iter().map(|split| split.method_count);

    // The fallbacks mirror a fold seeded with (usize::MAX, 0) for an empty slice.
    let smallest = method_counts().min().unwrap_or(usize::MAX);
    let largest = method_counts().max().unwrap_or(0);

    if smallest == 0 {
        return true;
    }
    largest as f64 / smallest as f64 <= config.max_size_ratio
}
/// Returns the index of the split with the most methods, or `None` for an
/// empty slice. When several splits tie, the last of them is returned.
fn find_largest_split_idx(splits: &[ModuleSplit]) -> Option<usize> {
    // Ordering by (count, index) makes the highest index win among equal counts.
    splits
        .iter()
        .enumerate()
        .map(|(idx, split)| (split.method_count, idx))
        .max()
        .map(|(_, idx)| idx)
}
/// Outcome of a single rebalancing attempt. Both variants hand the split
/// list back to the caller.
enum RebalanceResult {
/// The largest split was removed and its two halves were appended.
Rebalanced(Vec<ModuleSplit>),
/// No split could be divided; the list is returned as it was passed in.
CannotRebalance(Vec<ModuleSplit>),
}
/// Attempts one rebalancing step: divide the largest split in two.
///
/// On success the largest split is removed and its two halves are appended
/// to the end of the list. If the list is empty or the largest split cannot
/// be divided, the list is returned untouched in `CannotRebalance`.
fn try_rebalance_once(mut splits: Vec<ModuleSplit>) -> RebalanceResult {
    let division = find_largest_split_idx(&splits)
        .and_then(|idx| split_into_two(&splits[idx]).map(|halves| (idx, halves)));

    match division {
        Some((idx, halves)) => {
            splits.remove(idx);
            splits.extend(halves);
            RebalanceResult::Rebalanced(splits)
        }
        None => RebalanceResult::CannotRebalance(splits),
    }
}
/// Repeatedly divides the largest split until sizes are balanced.
///
/// Lists with fewer than two splits are returned unchanged. At most
/// `MAX_ITERATIONS` divisions are performed. The loop also stops as soon as
/// the distribution is balanced or the largest split cannot be divided.
fn ensure_balanced_distribution(
    mut splits: Vec<ModuleSplit>,
    config: &SplitSizeConfig,
) -> Vec<ModuleSplit> {
    const MAX_ITERATIONS: usize = 10;

    if splits.len() < 2 {
        return splits;
    }

    let mut attempts_left = MAX_ITERATIONS;
    while attempts_left > 0 && !is_distribution_balanced(&splits, config) {
        splits = match try_rebalance_once(splits) {
            RebalanceResult::Rebalanced(next) => next,
            RebalanceResult::CannotRebalance(unchanged) => return unchanged,
        };
        attempts_left -= 1;
    }
    splits
}
/// Divides `split` into two halves named `<name>_part1` and `<name>_part2`.
///
/// Returns `None` when the split has fewer than 20 methods. The method list
/// is cut at its midpoint. `method_count` and `estimated_lines` are halved,
/// with the second half receiving any remainder. Both halves get medium
/// priority, the same responsibility as the original, and an auto-split warning.
// NOTE(review): `structs_to_move`, `merge_history` and every field filled by
// `Default::default()` are not carried over from the original split — confirm
// that dropping them is intended.
fn split_into_two(split: &ModuleSplit) -> Option<Vec<ModuleSplit>> {
    if split.method_count < 20 {
        return None;
    }

    let midpoint = split.methods_to_move.len() / 2;
    let (front, back) = split.methods_to_move.split_at(midpoint);
    let front_count = split.method_count / 2;
    let front_lines = split.estimated_lines / 2;

    // Both halves differ only in name, methods and size figures.
    let build_half = |suggested_name: String,
                      methods: &[String],
                      method_count: usize,
                      estimated_lines: usize| ModuleSplit {
        suggested_name,
        methods_to_move: methods.to_vec(),
        structs_to_move: vec![],
        method_count,
        estimated_lines,
        priority: Priority::Medium,
        warning: Some("Auto-split for balanced distribution".to_string()),
        responsibility: split.responsibility.clone(),
        cohesion_score: None,
        merge_history: vec![],
        ..Default::default()
    };

    Some(vec![
        build_half(
            format!("{}_part1", split.suggested_name),
            front,
            front_count,
            front_lines,
        ),
        build_half(
            format!("{}_part2", split.suggested_name),
            back,
            split.method_count - front_count,
            split.estimated_lines - front_lines,
        ),
    ])
}
/// Reports whether `split` looks like a utility module.
///
/// True when the lowercased responsibility mentions "data structure",
/// "utilities" or "helper", or when the lowercased domain mentions "utilities".
fn is_utility_module(split: &ModuleSplit) -> bool {
    const RESPONSIBILITY_MARKERS: [&str; 3] = ["data structure", "utilities", "helper"];

    let responsibility = split.responsibility.to_lowercase();
    let flagged_by_responsibility = RESPONSIBILITY_MARKERS
        .iter()
        .any(|marker| responsibility.contains(*marker));
    if flagged_by_responsibility {
        return true;
    }

    split.domain.to_lowercase().contains("utilities")
}
/// Assigns a priority to `split` from its method count.
///
/// Up to 20 methods is high priority, 21 to 40 is medium, and more than 40
/// is low. Medium and low splits also receive a size warning, but only when
/// the split has no warning yet. `_config` is currently unused.
fn prioritize_by_size(mut split: ModuleSplit, _config: &SplitSizeConfig) -> ModuleSplit {
    let count = split.method_count;

    let (priority, size_warning) = match count {
        0..=20 => (Priority::High, None),
        21..=40 => (
            Priority::Medium,
            Some(format!(
                "{} methods is borderline - consider further splitting",
                count
            )),
        ),
        _ => (
            Priority::Low,
            Some(format!("{} methods is large for a single module", count)),
        ),
    };

    split.priority = priority;
    // An existing warning always takes precedence over the size warning.
    if split.warning.is_none() {
        split.warning = size_warning;
    }
    split
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `ModuleSplit` fixture; `estimated_lines` is 15 lines per method.
    fn make_split(
        name: &str,
        method_count: usize,
        methods: Vec<&str>,
        responsibility: &str,
    ) -> ModuleSplit {
        let methods_to_move = methods.into_iter().map(String::from).collect();
        ModuleSplit {
            suggested_name: name.to_string(),
            methods_to_move,
            structs_to_move: vec![],
            responsibility: responsibility.to_string(),
            estimated_lines: method_count * 15,
            method_count,
            warning: None,
            priority: Priority::Medium,
            cohesion_score: None,
            merge_history: vec![],
            ..Default::default()
        }
    }

    /// Three methods and 45 lines fall below both size thresholds.
    #[test]
    fn test_reject_undersized_splits() {
        let config = SplitSizeConfig::default();
        let tiny = make_split("undersized", 3, vec!["m1", "m2", "m3"], "test");
        assert!(!config.is_viable_split(&tiny));
    }

    /// Fifteen methods clear the default `min_methods` threshold.
    #[test]
    fn test_accept_valid_splits() {
        let config = SplitSizeConfig::default();
        let method_names = vec![
            "format_a", "format_b", "format_c", "format_d", "format_e", "format_f", "format_g",
            "format_h", "format_i", "format_j", "format_k", "format_l", "format_m", "format_n",
            "format_o",
        ];
        let sized = make_split("valid", 15, method_names, "formatting");
        assert!(config.is_viable_split(&sized));
    }

    /// A cohesive utility module is viable with only five methods.
    #[test]
    fn test_utility_module_exception() {
        let config = SplitSizeConfig::default();
        let mut utility = make_split(
            "utility",
            5,
            vec!["new", "default", "clone", "eq", "hash"],
            "data structure operations",
        );
        utility.cohesion_score = Some(0.85);
        assert!(config.is_viable_split(&utility));
    }

    /// Same prefix and same responsibility give a high similarity.
    #[test]
    fn test_semantic_similarity_high() {
        let formatters = make_split(
            "format_module",
            10,
            vec![
                "format_item",
                "format_header",
                "format_footer",
                "format_details",
            ],
            "formatting",
        );
        let display = make_split(
            "display_module",
            10,
            vec!["format_output", "format_table", "format_row"],
            "formatting",
        );
        let similarity = calculate_semantic_similarity(&formatters, &display);
        assert!(
            similarity > 0.5,
            "Expected high similarity, got {}",
            similarity
        );
    }

    /// Different prefixes and responsibilities give a low similarity.
    #[test]
    fn test_semantic_similarity_low() {
        let formatters = make_split(
            "format_module",
            10,
            vec!["format_item", "format_header"],
            "formatting",
        );
        let validators = make_split(
            "validate_module",
            10,
            vec!["validate_input", "check_errors"],
            "validation",
        );
        let similarity = calculate_semantic_similarity(&formatters, &validators);
        assert!(
            similarity < 0.5,
            "Expected low similarity, got {}",
            similarity
        );
    }

    /// Merging sums the counts, concatenates the methods and records history.
    #[test]
    fn test_merge_splits() {
        let target = make_split("target", 15, vec!["format_a", "format_b"], "formatting");
        let source = make_split("source", 5, vec!["format_c"], "formatting");
        let merged = merge_splits(target, source, 0.8);
        assert_eq!(merged.method_count, 20);
        assert_eq!(merged.methods_to_move.len(), 3);
        assert_eq!(merged.merge_history.len(), 1);
        let record = &merged.merge_history[0];
        assert_eq!(record.merged_from, "source");
        assert_eq!(record.similarity_score, 0.8);
    }

    /// Methods sharing one prefix score above 0.7.
    #[test]
    fn test_naming_cohesion_high() {
        let methods = vec![
            "format_item".to_string(),
            "format_header".to_string(),
            "format_footer".to_string(),
        ];
        let cohesion = calculate_naming_cohesion(&methods);
        assert!(cohesion > 0.7, "Expected high cohesion, got {}", cohesion);
    }

    /// Methods with all-different prefixes score below 0.7.
    #[test]
    fn test_naming_cohesion_low() {
        let methods = vec![
            "format_item".to_string(),
            "validate_input".to_string(),
            "parse_data".to_string(),
        ];
        let cohesion = calculate_naming_cohesion(&methods);
        assert!(cohesion < 0.7, "Expected low cohesion, got {}", cohesion);
    }

    /// An undersized split is folded into a similar viable split.
    #[test]
    fn test_validate_and_refine_merges_undersized() {
        let proposals = vec![
            make_split(
                "large",
                15,
                vec!["format_a", "format_b", "format_c"],
                "formatting",
            ),
            make_split("tiny", 3, vec!["format_d"], "formatting"),
        ];
        let config = SplitSizeConfig::default();
        let refined = validate_and_refine_splits_with_config(proposals, &config);
        assert_eq!(refined.len(), 1);
        assert!(refined[0].method_count >= 15);
        assert!(!refined[0].merge_history.is_empty());
    }

    /// Rebalancing brings the size ratio within 1.5x of the configured limit.
    #[test]
    fn test_balanced_distribution() {
        let config = SplitSizeConfig::default();
        let lopsided = vec![
            make_split("huge", 80, vec![], "test"),
            make_split("small", 10, vec![], "test"),
        ];
        let balanced = ensure_balanced_distribution(lopsided, &config);
        let sizes: Vec<_> = balanced.iter().map(|s| s.method_count).collect();
        let largest = *sizes.iter().max().unwrap();
        let smallest = *sizes.iter().min().unwrap();
        assert!(largest as f64 / smallest as f64 <= config.max_size_ratio * 1.5);
    }

    /// Covers snake_case, camelCase and single-word names.
    #[test]
    fn test_extract_method_prefix() {
        assert_eq!(
            extract_method_prefix("format_output"),
            Some("format".to_string())
        );
        assert_eq!(
            extract_method_prefix("validateInput"),
            Some("validate".to_string())
        );
        assert_eq!(extract_method_prefix("simple"), Some("simple".to_string()));
    }

    /// Utility detection follows the responsibility text.
    #[test]
    fn test_is_utility_module() {
        let mut candidate = make_split("util", 5, vec![], "data structure operations");
        assert!(is_utility_module(&candidate));
        candidate.responsibility = "formatting".to_string();
        assert!(!is_utility_module(&candidate));
        candidate.responsibility = "helper functions".to_string();
        assert!(is_utility_module(&candidate));
    }

    /// Scores below the default 0.3 fail; scores above pass.
    #[test]
    fn test_cohesion_validation() {
        let config = SplitSizeConfig::default();
        let mut candidate = make_split("low_cohesion", 15, vec![], "test");
        candidate.cohesion_score = Some(0.2);
        assert!(!config.has_sufficient_cohesion(&candidate));
        candidate.cohesion_score = Some(0.5);
        assert!(config.has_sufficient_cohesion(&candidate));
    }

    /// A single split is returned untouched.
    #[test]
    fn test_balanced_distribution_single_split() {
        let config = SplitSizeConfig::default();
        let lone = vec![make_split("only_one", 50, vec![], "test")];
        let balanced = ensure_balanced_distribution(lone, &config);
        assert_eq!(balanced.len(), 1);
        assert_eq!(balanced[0].suggested_name, "only_one");
    }

    /// An empty list stays empty.
    #[test]
    fn test_balanced_distribution_empty() {
        let config = SplitSizeConfig::default();
        let nothing: Vec<ModuleSplit> = vec![];
        let balanced = ensure_balanced_distribution(nothing, &config);
        assert!(balanced.is_empty());
    }

    /// Splits already within the ratio are not divided.
    #[test]
    fn test_balanced_distribution_already_balanced() {
        let config = SplitSizeConfig::default();
        let even = vec![
            make_split("split_a", 30, vec![], "test"),
            make_split("split_b", 25, vec![], "test"),
        ];
        let balanced = ensure_balanced_distribution(even, &config);
        assert_eq!(balanced.len(), 2);
    }

    /// The largest split has fewer than 20 methods, so nothing can be divided.
    #[test]
    fn test_balanced_distribution_unsplittable_large() {
        let config = SplitSizeConfig::default();
        let stuck = vec![
            make_split("unsplittable", 15, vec![], "test"),
            make_split("tiny", 5, vec![], "test"),
        ];
        let balanced = ensure_balanced_distribution(stuck, &config);
        assert_eq!(balanced.len(), 2);
    }

    /// A zero-method split makes the distribution count as balanced.
    #[test]
    fn test_balanced_distribution_with_zero_size() {
        let config = SplitSizeConfig::default();
        let with_empty = vec![
            make_split("normal", 50, vec![], "test"),
            make_split("empty", 0, vec![], "test"),
        ];
        let balanced = ensure_balanced_distribution(with_empty, &config);
        assert_eq!(balanced.len(), 2);
    }
}