use std::collections::{HashMap, HashSet};
use std::path::Path;
use std::process::Command;
use serde::{Deserialize, Serialize};
use crate::types::Intent;
/// One test case mined from a single commit: a seed file together with the
/// other source files that were changed in that same commit.
///
/// Produced by `extract_cases`, which emits one `GitCase` per source file of
/// every commit that passes its size and quality filters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GitCase {
/// Hash of the commit this case was extracted from.
pub commit_sha: String,
/// Commit timestamp, parsed from the `%ct` field of the `git log` header
/// (0 when that field could not be parsed as an integer).
pub timestamp: i64,
/// Path of the source file the case is built around.
pub seed_file: String,
/// Every other source file touched by the same commit; never empty for a
/// case produced by `extract_cases`.
pub expected_related: Vec<String>,
/// Commit subject line (the `%s` field of the `git log` header).
pub message: String,
/// Intent derived from the commit message by `parse_intent`.
pub inferred_intent: Intent,
/// Score returned by `compute_commit_quality` for the commit.
pub quality_weight: f64,
/// Total number of paths listed for the commit, source files or not.
pub commit_file_count: usize,
}
/// A `GitCase` after `weight_cases` has attached coupling weights to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightedCase {
/// Path of the source file the case is built around.
pub seed_file: String,
/// Hash of the commit this case was extracted from.
pub commit_sha: String,
/// Intent carried over unchanged from the originating `GitCase`.
pub inferred_intent: Intent,
/// Each expected file paired with its coupling weight to the seed file;
/// `weight_cases` uses 0.1 when the pair is absent from the coupling map.
pub expected_related: Vec<(String, f64)>,
/// Overall case weight: the commit quality multiplied by
/// `0.5 + average coupling weight` of `expected_related`.
pub case_weight: f64,
}
/// A commit as read from `git log` output, before any filtering or scoring.
#[derive(Debug, Clone)]
struct RawCommit {
// Commit hash taken from the first `|`-separated field of the header line.
sha: String,
// Second header field parsed as an integer; 0 if it fails to parse.
timestamp: i64,
// Third header field: the commit subject line.
message: String,
// Every non-empty line that followed the header, i.e. the paths printed by
// `--name-only`.
files: Vec<String>,
}
/// Mines co-change test cases from the git history of `repo`.
///
/// Looks at up to `max_commits` commits. A commit is ignored when its total
/// number of listed files falls outside `min_files..=max_files`, or when
/// `compute_commit_quality` scores it below 0.1.
///
/// For every source file of a surviving commit one [`GitCase`] is produced,
/// using that file as the seed and all *other* source files of the commit (in
/// listing order) as the expected related files. A seed with no other source
/// file alongside it yields no case.
pub fn extract_cases(
    repo: &Path,
    max_commits: usize,
    min_files: usize,
    max_files: usize,
) -> Vec<GitCase> {
    let mut cases = Vec::new();

    for commit in parse_git_log(repo, max_commits) {
        let file_count = commit.files.len();
        if !(min_files..=max_files).contains(&file_count) {
            continue;
        }

        let quality = compute_commit_quality(&commit);
        if quality < 0.1 {
            continue;
        }

        let intent = parse_intent(&commit.message);

        // Only source files can act as seed or as expected neighbour, so the
        // filter is applied once per commit instead of once per pair.
        let sources: Vec<&String> = commit
            .files
            .iter()
            .filter(|file| is_source_file(file))
            .collect();

        for (seed_pos, seed) in sources.iter().enumerate() {
            // Exclude the seed by position, not by value.
            let neighbours: Vec<String> = sources
                .iter()
                .enumerate()
                .filter(|&(pos, _)| pos != seed_pos)
                .map(|(_, file)| (*file).clone())
                .collect();
            if neighbours.is_empty() {
                continue;
            }

            cases.push(GitCase {
                commit_sha: commit.sha.clone(),
                timestamp: commit.timestamp,
                seed_file: (*seed).clone(),
                expected_related: neighbours,
                message: commit.message.clone(),
                inferred_intent: intent,
                quality_weight: quality,
                commit_file_count: file_count,
            });
        }
    }

    cases
}
/// Computes a Jaccard co-change weight for every pair of source files that
/// were modified together in the history of `repo`.
///
/// Up to `max_commits` commits are read; commits listing more than 20 files
/// are ignored. For a pair `(a, b)` the weight is
/// `commits touching both / (commits touching a + commits touching b - commits touching both)`.
///
/// Keys are ordered pairs as produced by `normalize_pair`, so each unordered
/// pair appears at most once.
pub fn compute_coupling_weights(repo: &Path, max_commits: usize) -> HashMap<(String, String), f64> {
    let mut pair_hits: HashMap<(String, String), usize> = HashMap::new();
    let mut file_hits: HashMap<String, usize> = HashMap::new();

    for commit in parse_git_log(repo, max_commits) {
        if commit.files.len() > 20 {
            continue;
        }

        let sources: Vec<&String> = commit
            .files
            .iter()
            .filter(|file| is_source_file(file))
            .collect();

        for file in &sources {
            *file_hits.entry((*file).clone()).or_default() += 1;
        }

        // Visit every unordered pair exactly once: each file against the
        // files listed after it.
        for (idx, first) in sources.iter().enumerate() {
            for second in &sources[idx + 1..] {
                *pair_hits.entry(normalize_pair(first, second)).or_default() += 1;
            }
        }
    }

    let mut weights = HashMap::with_capacity(pair_hits.len());
    for ((a, b), together) in pair_hits {
        if let (Some(&count_a), Some(&count_b)) = (file_hits.get(&a), file_hits.get(&b)) {
            let union = count_a + count_b - together;
            if union != 0 {
                weights.insert((a, b), together as f64 / union as f64);
            }
        }
    }
    weights
}
/// Converts raw cases into [`WeightedCase`]s using a precomputed coupling map.
///
/// Every expected file is paired with the coupling weight between it and the
/// seed file (looked up under the `normalize_pair` key); pairs missing from
/// `coupling` get a weight of 0.1. The case weight is
/// `quality_weight * (0.5 + mean coupling weight)`, where the mean is taken as
/// 0.0 for a case without expected files.
pub fn weight_cases(
    cases: Vec<GitCase>,
    coupling: &HashMap<(String, String), f64>,
) -> Vec<WeightedCase> {
    let mut weighted = Vec::with_capacity(cases.len());

    for case in cases {
        let mut related: Vec<(String, f64)> = Vec::with_capacity(case.expected_related.len());
        for file in &case.expected_related {
            let key = normalize_pair(&case.seed_file, file);
            let weight = match coupling.get(&key) {
                Some(&known) => known,
                None => 0.1,
            };
            related.push((file.clone(), weight));
        }

        let avg_coupling: f64 = if related.is_empty() {
            0.0
        } else {
            related.iter().map(|(_, w)| w).sum::<f64>() / related.len() as f64
        };
        let case_weight = case.quality_weight * (0.5 + avg_coupling);

        weighted.push(WeightedCase {
            seed_file: case.seed_file,
            commit_sha: case.commit_sha,
            inferred_intent: case.inferred_intent,
            expected_related: related,
            case_weight,
        });
    }

    weighted
}
/// Groups cases into sessions of temporally adjacent work.
///
/// Cases are ordered by timestamp (stable, so equal timestamps keep their
/// input order). A case joins the current session when it is at most
/// `session_gap_secs` later than the case immediately before it; otherwise it
/// opens a new session. The gap is therefore measured between consecutive
/// cases, not from the start of the session.
///
/// Returns an empty vector for empty input; no returned session is empty.
pub fn cluster_into_sessions(cases: &[GitCase], session_gap_secs: i64) -> Vec<Vec<&GitCase>> {
    let mut ordered: Vec<&GitCase> = cases.iter().collect();
    ordered.sort_by_key(|case| case.timestamp);

    let mut sessions: Vec<Vec<&GitCase>> = Vec::new();
    let mut previous_ts: Option<i64> = None;

    for case in ordered {
        let joins_open_session = match previous_ts {
            Some(prev) => case.timestamp - prev <= session_gap_secs,
            None => false,
        };

        if joins_open_session {
            // A previous case exists, so a session is already open.
            if let Some(open) = sessions.last_mut() {
                open.push(case);
            }
        } else {
            sessions.push(vec![case]);
        }
        previous_ts = Some(case.timestamp);
    }

    sessions
}
/// Runs `git log` inside `repo` and parses up to `max_commits` commits.
///
/// Each commit is requested as a `hash|timestamp|subject` header line
/// (`%H|%ct|%s`) followed by the paths it touched (`--name-only`).
/// `core.quotepath=false` is set for this one invocation so that paths with
/// non-ASCII bytes are printed verbatim rather than wrapped in quotes with
/// octal escapes, which would otherwise stop them from being recognised by
/// extension later on.
///
/// Returns an empty vector, after reporting on stderr, when `git` cannot be
/// started or exits unsuccessfully. Commits that list no files are dropped.
fn parse_git_log(repo: &Path, max_commits: usize) -> Vec<RawCommit> {
    let output = Command::new("git")
        .current_dir(repo)
        .args([
            "-c",
            "core.quotepath=false",
            "log",
            &format!("-{}", max_commits),
            "--pretty=format:%H|%ct|%s",
            "--name-only",
        ])
        .output();

    let output = match output {
        Ok(output) => output,
        Err(e) => {
            eprintln!("Failed to run git log: {}", e);
            return Vec::new();
        }
    };
    if !output.status.success() {
        eprintln!(
            "git log failed: {}",
            String::from_utf8_lossy(&output.stderr)
        );
        return Vec::new();
    }

    parse_log_output(&String::from_utf8_lossy(&output.stdout))
}

/// Splits `git log --pretty=format:%H|%ct|%s --name-only` output into commits.
///
/// Blank lines are ignored. A header line starts a new commit; every other
/// line is recorded as a path of the commit currently being read. Lines that
/// appear before the first header are discarded, as are commits without paths.
fn parse_log_output(stdout: &str) -> Vec<RawCommit> {
    let mut commits = Vec::new();
    let mut current: Option<RawCommit> = None;

    for line in stdout.lines().filter(|line| !line.is_empty()) {
        if let Some(header) = parse_commit_header(line) {
            if let Some(finished) = current.replace(header) {
                if !finished.files.is_empty() {
                    commits.push(finished);
                }
            }
        } else if let Some(commit) = current.as_mut() {
            commit.files.push(line.to_string());
        }
    }

    if let Some(last) = current {
        if !last.files.is_empty() {
            commits.push(last);
        }
    }
    commits
}

/// Parses one `hash|timestamp|subject` header line into a commit with no files.
///
/// Returns `None` when the line does not have three `|`-separated fields or
/// when the first field is not a full object id, which is how path lines are
/// told apart from headers.
fn parse_commit_header(line: &str) -> Option<RawCommit> {
    let mut fields = line.splitn(3, '|');
    let sha = fields.next()?;
    let timestamp = fields.next()?;
    let subject = fields.next()?;

    // A full object id is 40 hex digits (SHA-1) or 64 (SHA-256 repositories).
    // Requiring hex digits keeps a path that merely contains '|' from being
    // mistaken for a header.
    let is_object_id =
        matches!(sha.len(), 40 | 64) && sha.bytes().all(|b| b.is_ascii_hexdigit());
    if !is_object_id {
        return None;
    }

    Some(RawCommit {
        sha: sha.to_string(),
        // An unparsable timestamp falls back to 0 instead of rejecting the commit.
        timestamp: timestamp.parse().unwrap_or(0),
        message: subject.to_string(),
        files: Vec::new(),
    })
}
/// Scores how trustworthy a commit is as evidence that its files belong
/// together. The result lies in `0.0..=2.0`; `0.0` means "discard".
///
/// Hard rejections (score `0.0`):
/// - the commit lists at most one file;
/// - the message looks like a formatting/lint-only change.
///
/// Otherwise the score starts at 1.0 and is multiplied by, in order:
/// - message category: fix ×1.5, feature ×1.2, refactor/rename/move ×0.6;
/// - work-in-progress markers ×0.2;
/// - merge commits ×0.3;
/// - file count: up to 6 files ×1.2, up to 10 ×1.0, up to 15 ×0.7, more ×0.4;
/// - files spread over three or more directories ×1.2;
/// - fewer than 30% source files ×0.5;
/// and is finally capped at 2.0.
///
/// Formatting and work-in-progress markers are matched at the start of a word
/// in the lower-cased message. Inflections such as "formatting" or "saved"
/// still count, while unrelated words that merely contain a marker
/// ("information", "transformation", "swipe") no longer zero or down-weight
/// an otherwise good commit.
fn compute_commit_quality(commit: &RawCommit) -> f64 {
    // True when any word begins with one of `markers`.
    fn any_word_starts_with(words: &[&str], markers: &[&str]) -> bool {
        words
            .iter()
            .any(|word| markers.iter().any(|marker| word.starts_with(*marker)))
    }

    let msg = commit.message.to_lowercase();
    let words: Vec<&str> = msg
        .split(|c: char| !c.is_alphanumeric())
        .filter(|word| !word.is_empty())
        .collect();

    let n = commit.files.len();
    if n <= 1 {
        return 0.0;
    }

    // Formatting-only commits touch files that are not logically related.
    // "reformat" and "eslint" are listed explicitly because word-start
    // matching would otherwise miss them.
    if any_word_starts_with(&words, &["format", "reformat", "lint", "eslint", "prettier"])
        || msg.contains("style:")
    {
        return 0.0;
    }

    let mut weight: f64 = 1.0;

    if msg.starts_with("fix")
        || msg.contains("bugfix")
        || msg.contains("hotfix")
        || msg.starts_with("bug:")
    {
        weight *= 1.5;
    } else if msg.starts_with("feat")
        || msg.contains("implement")
        || msg.starts_with("add ")
        || msg.starts_with("add:")
    {
        weight *= 1.2;
    } else if msg.starts_with("refactor") || msg.contains("rename") || msg.contains("move ") {
        weight *= 0.6;
    }

    if any_word_starts_with(&words, &["wip", "save", "checkpoint", "tmp"])
        || msg.contains("work in progress")
    {
        weight *= 0.2;
    }

    if msg.starts_with("merge") {
        weight *= 0.3;
    }

    // `n >= 2` here; small commits are the most focused signal.
    weight *= match n {
        2..=6 => 1.2,
        7..=10 => 1.0,
        11..=15 => 0.7,
        _ => 0.4,
    };

    let unique_dirs: HashSet<&str> = commit
        .files
        .iter()
        .filter_map(|f| Path::new(f).parent())
        .filter_map(|p| p.to_str())
        .collect();
    if unique_dirs.len() >= 3 {
        weight *= 1.2;
    }

    let source_count = commit.files.iter().filter(|f| is_source_file(f)).count();
    let source_ratio = source_count as f64 / n as f64;
    if source_ratio < 0.3 {
        weight *= 0.5;
    }

    weight.min(2.0)
}
/// Guesses the intent behind a commit from keywords in its message.
///
/// The message is lower-cased and searched for keyword substrings. Groups are
/// tried in a fixed priority order and the first group with a hit wins:
/// debugging, then refactoring, then extension. A message matching none of
/// them is classified as `Intent::Explore`.
fn parse_intent(message: &str) -> Intent {
    const DEBUG_HINTS: &[&str] = &["fix", "bug", "issue", "error", "crash", "debug"];
    const REFACTOR_HINTS: &[&str] = &["refactor", "clean", "rename", "reorganize", "restructure"];
    const EXTEND_HINTS: &[&str] = &["add", "implement", "feature", "new", "create", "support"];

    let lowered = message.to_lowercase();
    let mentions_any = |hints: &[&str]| hints.iter().any(|hint| lowered.contains(*hint));

    if mentions_any(DEBUG_HINTS) {
        return Intent::Debug;
    }
    if mentions_any(REFACTOR_HINTS) {
        return Intent::Refactor;
    }
    if mentions_any(EXTEND_HINTS) {
        return Intent::Extend;
    }
    Intent::Explore
}
/// Reports whether `path` names a first-party source file.
///
/// A path qualifies when it ends with one of the recognised source-code
/// extensions and none of its directory components is a vendored, generated
/// or build-output directory (`node_modules`, `vendor`, `third_party`,
/// `__pycache__`, `target`, `.git`, `dist`, `build`).
///
/// Directory names are compared as whole `/`-separated components, so a
/// directory such as `render_target/` or `rebuild/` is not mistaken for
/// `target/` or `build/`. The file name itself is never treated as a
/// directory, so `build.rs` at the repository root is still a source file.
fn is_source_file(path: &str) -> bool {
    const SOURCE_EXTENSIONS: &[&str] = &[
        ".rs", ".py", ".js", ".ts", ".jsx", ".tsx", ".mjs", ".cjs", ".go", ".c", ".h", ".cpp",
        ".hpp", ".cc", ".hh", ".java", ".kt", ".kts", ".rb", ".php", ".swift", ".cs", ".scala",
        ".ex", ".exs", ".hs", ".ml", ".mli", ".zig",
    ];
    const VENDOR_DIRS: &[&str] = &[
        "node_modules",
        "vendor",
        "third_party",
        "__pycache__",
        "target",
        ".git",
        "dist",
        "build",
    ];

    if !SOURCE_EXTENSIONS.iter().any(|ext| path.ends_with(*ext)) {
        return false;
    }

    // Everything before the last '/' is the directory part; a bare file name
    // has no directories to inspect.
    match path.rsplit_once('/') {
        Some((dirs, _file_name)) => !dirs
            .split('/')
            .any(|component| VENDOR_DIRS.contains(&component)),
        None => true,
    }
}
/// Returns the two paths as an owned pair in ascending string order, so that
/// `(a, b)` and `(b, a)` map to the same key.
fn normalize_pair(a: &str, b: &str) -> (String, String) {
    let (first, second) = if a < b { (a, b) } else { (b, a) };
    (first.to_owned(), second.to_owned())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a commit fixture; only the message and the file list matter to
    /// the functions under test.
    fn commit_with(message: &str, files: Vec<String>) -> RawCommit {
        RawCommit {
            sha: "abc123".repeat(7),
            timestamp: 0,
            message: message.to_string(),
            files,
        }
    }

    /// Owned path list from string literals.
    fn paths(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| name.to_string()).collect()
    }

    #[test]
    fn test_is_source_file() {
        for accepted in ["src/main.rs", "lib/parser.py", "components/Button.tsx"] {
            assert!(is_source_file(accepted), "{}", accepted);
        }
        for rejected in ["README.md", "package.json", "node_modules/lodash/index.js"] {
            assert!(!is_source_file(rejected), "{}", rejected);
        }
    }

    #[test]
    fn test_normalize_pair() {
        let sorted = ("a.rs".to_string(), "b.rs".to_string());
        assert_eq!(normalize_pair("b.rs", "a.rs"), sorted);
        assert_eq!(normalize_pair("a.rs", "b.rs"), sorted);
    }

    #[test]
    fn test_parse_intent() {
        let expectations = [
            ("fix: null pointer in parser", Intent::Debug),
            ("refactor: clean up auth module", Intent::Refactor),
            ("add: new user registration", Intent::Extend),
            ("update docs", Intent::Explore),
        ];
        for (message, expected) in expectations {
            assert_eq!(parse_intent(message), expected);
        }
    }

    #[test]
    fn test_commit_quality_bugfix() {
        let commit = commit_with(
            "fix: crash on empty input",
            paths(&["src/parser.rs", "src/input.rs"]),
        );
        let quality = compute_commit_quality(&commit);
        assert!(
            quality > 1.0,
            "Bugfix should have high quality: {}",
            quality
        );
    }

    #[test]
    fn test_commit_quality_wip() {
        let commit = commit_with(
            "WIP: still working on this",
            paths(&["src/a.rs", "src/b.rs"]),
        );
        let quality = compute_commit_quality(&commit);
        assert!(quality < 0.5, "WIP should have low quality: {}", quality);
    }

    #[test]
    fn test_commit_quality_formatting() {
        let commit = commit_with(
            "chore: format code with prettier",
            paths(&["src/a.rs", "src/b.rs"]),
        );
        let quality = compute_commit_quality(&commit);
        assert_eq!(quality, 0.0, "Formatting should be skipped");
    }

    #[test]
    fn test_commit_quality_large_refactor() {
        let commit = commit_with(
            "refactor: rename foo to bar everywhere",
            (0..20).map(|i| format!("src/file{}.rs", i)).collect(),
        );
        let quality = compute_commit_quality(&commit);
        assert!(
            quality < 0.5,
            "Large refactor should have low quality: {}",
            quality
        );
    }
}