use anyhow::Result;
use chrono::{DateTime, Utc};
use std::collections::HashMap;
use crate::graph::interner::{global_interner, StrKey};
/// Tuning parameters for building a [`CoChangeMatrix`] from commit history.
#[derive(Debug, Clone)]
pub struct CoChangeConfig {
/// Half-life, in days, of the exponential decay applied to each commit's
/// contribution: a commit this many days old counts half as much as one made now.
pub half_life_days: f64,
/// Minimum accumulated weight a file pair needs to stay in the matrix;
/// pairs below it are pruned after all commits have been processed.
pub min_weight: f32,
/// Commits that list more files than this are skipped entirely.
pub max_files_per_commit: usize,
/// Upper bound on how many commits (taken from the front of the input) are considered.
pub max_commits: usize,
/// Commits whose decay factor is below this value are not counted.
/// A value of `0.0` or less disables this cutoff.
pub min_decay: f32,
}
impl Default for CoChangeConfig {
    /// Returns the stock configuration: a 90-day half-life, a pruning
    /// threshold of 0.5, at most 30 files per commit, at most 5000 commits,
    /// and a decay cutoff of 0.001.
    fn default() -> Self {
        let half_life_days = 90.0;
        let min_weight = 0.5;
        let max_files_per_commit = 30;
        let max_commits = 5000;
        let min_decay = 0.001;
        Self {
            half_life_days,
            min_weight,
            max_files_per_commit,
            max_commits,
            min_decay,
        }
    }
}
/// Sparse, symmetric matrix of decay-weighted co-change strength between files,
/// together with the per-file and per-pair statistics gathered while building it.
///
/// Pairs are keyed in canonical order (smaller key first).
#[derive(Debug)]
pub struct CoChangeMatrix {
/// Accumulated decay weight per file pair; pairs below the configured
/// minimum weight have been removed.
entries: HashMap<(StrKey, StrKey), f32>,
/// Half-life (in days) the matrix was built with.
half_life_days: f64,
/// Number of commits that passed the filters and contributed to the matrix.
commits_analyzed: usize,
/// Sum of decay factors of the analyzed commits that touched each file.
file_weights: HashMap<StrKey, f32>,
/// Sum of decay factors over all analyzed commits.
total_decay_weight: f32,
/// Raw number of analyzed commits in which each pair appeared together.
/// Unlike `entries`, this map is not pruned by weight.
pair_counts: HashMap<(StrKey, StrKey), u32>,
/// Raw number of analyzed commits that touched each file.
file_counts: HashMap<StrKey, u32>,
/// Number of distinct partner files per file, derived from the keys of `pair_counts`.
coupling_degrees: HashMap<StrKey, usize>,
}
impl CoChangeMatrix {
    /// Creates a matrix that contains no pairs, files or commits.
    ///
    /// The reported half-life is the fixed value `90.0`.
    pub fn empty() -> Self {
        Self {
            entries: HashMap::new(),
            half_life_days: 90.0,
            commits_analyzed: 0,
            file_weights: HashMap::new(),
            total_decay_weight: 0.0,
            pair_counts: HashMap::new(),
            file_counts: HashMap::new(),
            coupling_degrees: HashMap::new(),
        }
    }

    /// Returns `true` when no file pair survived pruning.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// Number of file pairs stored in the matrix (after pruning).
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Number of commits that contributed to the matrix.
    pub fn commits_analyzed(&self) -> usize {
        self.commits_analyzed
    }

    /// Half-life, in days, of the decay used to build the matrix.
    pub fn half_life_days(&self) -> f64 {
        self.half_life_days
    }

    /// Decay-weighted co-change weight of the pair `(a, b)`, in either order.
    ///
    /// Returns `None` when the pair is not stored (it never co-changed or was pruned).
    pub fn weight(&self, a: StrKey, b: StrKey) -> Option<f32> {
        self.entries.get(&canonical_pair(a, b)).copied()
    }

    /// Same as [`weight`](Self::weight), but looks the files up by path.
    ///
    /// Returns `None` when either path is unknown to the global interner.
    pub fn weight_by_path(&self, a: &str, b: &str) -> Option<f32> {
        let si = global_interner();
        let ka = si.get(a)?;
        let kb = si.get(b)?;
        self.weight(ka, kb)
    }

    /// Sum of decay factors of the analyzed commits that touched `file`,
    /// or `None` when the file appeared in no analyzed commit.
    pub fn file_weight(&self, file: StrKey) -> Option<f32> {
        self.file_weights.get(&file).copied()
    }

    /// Sum of decay factors over all analyzed commits.
    pub fn total_decay_weight(&self) -> f32 {
        self.total_decay_weight
    }

    /// Number of distinct files that appeared in at least one analyzed commit.
    pub fn file_count(&self) -> usize {
        self.file_weights.len()
    }

    /// Raw number of analyzed commits in which `a` and `b` changed together
    /// (`0` when they never did). This count is not affected by weight pruning.
    pub fn pair_commit_count(&self, a: StrKey, b: StrKey) -> u32 {
        self.pair_counts
            .get(&canonical_pair(a, b))
            .copied()
            .unwrap_or(0)
    }

    /// Raw number of analyzed commits that touched `file` (`0` when unknown).
    pub fn file_commit_count(&self, file: StrKey) -> u32 {
        self.file_counts.get(&file).copied().unwrap_or(0)
    }

    /// Symmetric confidence of the pair: the smaller of
    /// `pair_count / count(a)` and `pair_count / count(b)`.
    ///
    /// Returns `0.0` when the pair never co-changed or either file has no commits.
    pub fn confidence(&self, a: StrKey, b: StrKey) -> f32 {
        let pair = self.pair_commit_count(a, b);
        let count_a = self.file_commit_count(a);
        let count_b = self.file_commit_count(b);
        if pair == 0 || count_a == 0 || count_b == 0 {
            return 0.0;
        }
        let conf_ab = pair as f32 / count_a as f32;
        let conf_ba = pair as f32 / count_b as f32;
        conf_ab.min(conf_ba)
    }

    /// Number of distinct files that co-changed with `file` at least once
    /// (counted over the unpruned pair counts; `0` when unknown).
    pub fn coupling_degree(&self, file: StrKey) -> usize {
        self.coupling_degrees.get(&file).copied().unwrap_or(0)
    }

    /// Smoothed lift of the pair `(a, b)`.
    ///
    /// With `w_ab` the pair weight, `w_a` / `w_b` the file weights, `T` the total
    /// decay weight, `n` the number of tracked files and `alpha = 1`, the result is
    /// `(w_ab + alpha) * (T + alpha * n * n) / ((w_a + alpha * n) * (w_b + alpha * n))`.
    ///
    /// Returns `None` when the pair is not stored, when either file has no
    /// recorded weight, or when a file weight or the total weight is exactly zero.
    pub fn lift(&self, a: StrKey, b: StrKey) -> Option<f32> {
        let pair_weight = self.weight(a, b)?;
        let weight_a = self.file_weight(a)?;
        let weight_b = self.file_weight(b)?;
        if weight_a == 0.0 || weight_b == 0.0 || self.total_decay_weight == 0.0 {
            return None;
        }
        let n = self.file_weights.len() as f32;
        let alpha: f32 = 1.0;
        let numerator = (pair_weight + alpha) * (self.total_decay_weight + alpha * n * n);
        let denominator = (weight_a + alpha * n) * (weight_b + alpha * n);
        Some(numerator / denominator)
    }

    /// Iterates over the stored `(pair, weight)` entries in unspecified order.
    pub fn iter(&self) -> impl Iterator<Item = (&(StrKey, StrKey), &f32)> {
        self.entries.iter()
    }

    /// Builds a matrix from `(timestamp, changed files)` records.
    ///
    /// Only the first `config.max_commits` records are considered. For each one:
    ///
    /// * its age relative to `now` (clamped to zero for future timestamps) gives a
    ///   decay factor `exp(-ln 2 * age_days / half_life_days)`;
    /// * it is skipped when `config.min_decay > 0` and the decay is below it;
    /// * it is skipped when it lists more than `config.max_files_per_commit`
    ///   files (measured before duplicate paths are removed);
    /// * otherwise every distinct file and every distinct file pair in it
    ///   receives the decay factor as weight and has its raw count incremented.
    ///
    /// After all records are processed, pairs whose accumulated weight is below
    /// `config.min_weight` are removed from the weight map; raw counts and
    /// coupling degrees keep every observed pair.
    ///
    /// The records do not have to be ordered by time: a record that is too old is
    /// skipped on its own and does not stop processing of the records after it.
    pub fn from_commits(
        commits: &[(DateTime<Utc>, Vec<String>)],
        config: &CoChangeConfig,
        now: DateTime<Utc>,
    ) -> Self {
        let si = global_interner();
        let ln2: f64 = std::f64::consts::LN_2;

        let mut entries: HashMap<(StrKey, StrKey), f32> = HashMap::new();
        let mut file_weights: HashMap<StrKey, f32> = HashMap::new();
        let mut total_decay_weight: f32 = 0.0;
        let mut commits_analyzed: usize = 0;
        let mut pair_counts: HashMap<(StrKey, StrKey), u32> = HashMap::new();
        let mut file_counts: HashMap<StrKey, u32> = HashMap::new();

        for (ts, files) in commits.iter().take(config.max_commits) {
            let age_days = (now - *ts).num_seconds().max(0) as f64 / 86_400.0;
            let decay = (-ln2 * age_days / config.half_life_days).exp() as f32;

            // Skip only this record. Commit timestamps are not guaranteed to be
            // monotonic along history (clock skew, rebases, imported history), so
            // one very old record must not discard the newer records behind it.
            if config.min_decay > 0.0 && decay < config.min_decay {
                continue;
            }
            // Very large commits (bulk renames, vendoring, formatting sweeps) are ignored.
            if files.len() > config.max_files_per_commit {
                continue;
            }
            commits_analyzed += 1;

            // Sorted + deduplicated keys make (keys[i], keys[j]) with i < j canonical.
            let mut keys: Vec<StrKey> = files.iter().map(|f| si.intern(f)).collect();
            keys.sort_unstable();
            keys.dedup();

            for &file_key in &keys {
                *file_weights.entry(file_key).or_insert(0.0) += decay;
                *file_counts.entry(file_key).or_insert(0) += 1;
            }
            total_decay_weight += decay;

            for (i, &lo) in keys.iter().enumerate() {
                for &hi in &keys[i + 1..] {
                    *entries.entry((lo, hi)).or_insert(0.0) += decay;
                    *pair_counts.entry((lo, hi)).or_insert(0) += 1;
                }
            }
        }

        // Drop weakly coupled pairs from the weight map only.
        entries.retain(|_, w| *w >= config.min_weight);

        // Degree = number of distinct partners, counted over every observed pair.
        let mut coupling_degrees: HashMap<StrKey, usize> = HashMap::new();
        for &(a, b) in pair_counts.keys() {
            *coupling_degrees.entry(a).or_insert(0) += 1;
            *coupling_degrees.entry(b).or_insert(0) += 1;
        }

        Self {
            entries,
            half_life_days: config.half_life_days,
            commits_analyzed,
            file_weights,
            total_decay_weight,
            pair_counts,
            file_counts,
            coupling_degrees,
        }
    }
}
/// Returns the two keys ordered so that the smaller one comes first, giving
/// `(a, b)` and `(b, a)` the same map key.
fn canonical_pair(a: StrKey, b: StrKey) -> (StrKey, StrKey) {
    if b < a {
        return (b, a);
    }
    (a, b)
}
/// Builds a co-change matrix from the git history of the repository at `repo_path`.
///
/// Requests up to `config.max_commits` commits via
/// `GitHistory::get_recent_commits_paths_only` and feeds them to
/// [`CoChangeMatrix::from_commits`] using the current time as reference.
/// When fewer than two commits are returned, an empty matrix is produced.
///
/// # Errors
///
/// Propagates any error from opening the history or reading the commits.
pub fn compute_from_repo(
    repo_path: &std::path::Path,
    config: &CoChangeConfig,
) -> Result<CoChangeMatrix> {
    let repo_history = crate::git::history::GitHistory::open(repo_path)?;
    let recent = repo_history.get_recent_commits_paths_only(config.max_commits)?;
    match recent.len() {
        0 | 1 => Ok(CoChangeMatrix::empty()),
        _ => Ok(CoChangeMatrix::from_commits(&recent, config, Utc::now())),
    }
}
/// Builds a co-change matrix from already-loaded commit records.
///
/// Each record's `timestamp` is parsed as RFC 3339 and converted to UTC;
/// records whose timestamp fails to parse are silently dropped. When fewer
/// than two records remain, an empty matrix is returned.
#[allow(dead_code)]
fn compute_from_commit_info(
    raw_commits: Vec<crate::git::history::CommitInfo>,
    config: &CoChangeConfig,
) -> CoChangeMatrix {
    // Captured before parsing so the reference time precedes all the work below.
    let now = Utc::now();

    let mut parsed: Vec<(DateTime<Utc>, Vec<String>)> = Vec::new();
    for info in raw_commits {
        if let Ok(stamp) = DateTime::parse_from_rfc3339(&info.timestamp) {
            parsed.push((stamp.with_timezone(&Utc), info.files_changed));
        }
    }

    if parsed.len() < 2 {
        CoChangeMatrix::empty()
    } else {
        CoChangeMatrix::from_commits(&parsed, config, now)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Duration;

    /// Builds a commit record dated `age_days` before `now` that touches `files`.
    fn commit_at(
        now: DateTime<Utc>,
        age_days: i64,
        files: &[&str],
    ) -> (DateTime<Utc>, Vec<String>) {
        let paths: Vec<String> = files.iter().map(|path| String::from(*path)).collect();
        (now - Duration::days(age_days), paths)
    }

    /// The stock configuration.
    fn default_config() -> CoChangeConfig {
        CoChangeConfig::default()
    }

    /// The stock configuration with the pruning threshold lowered to 0.01.
    fn lenient_config() -> CoChangeConfig {
        CoChangeConfig {
            min_weight: 0.01,
            ..default_config()
        }
    }

    /// Looks up the interned key for `path`, panicking with `msg` when absent.
    fn key_of(path: &str, msg: &str) -> StrKey {
        global_interner().get(path).expect(msg)
    }

    // An empty matrix reports no pairs and no analyzed commits.
    #[test]
    fn test_empty_matrix() {
        let matrix = CoChangeMatrix::empty();
        assert!(matrix.is_empty());
        assert_eq!(matrix.len(), 0);
        assert_eq!(matrix.commits_analyzed(), 0);
    }

    // One fresh commit with two files yields one symmetric pair of weight ~1.
    #[test]
    fn test_single_commit_two_files() {
        let now = Utc::now();
        let history = [commit_at(now, 0, &["src/a.rs", "src/b.rs"])];
        let matrix = CoChangeMatrix::from_commits(&history, &default_config(), now);

        assert_eq!(matrix.len(), 1);
        assert_eq!(matrix.commits_analyzed(), 1);

        let forward = matrix.weight_by_path("src/a.rs", "src/b.rs");
        assert!(forward.is_some());
        let w = forward.expect("weight should exist");
        assert!((w - 1.0).abs() < 0.01, "expected ~1.0, got {w}");

        let reverse = matrix.weight_by_path("src/b.rs", "src/a.rs");
        assert_eq!(reverse, Some(w));
    }

    // A commit two half-lives old contributes about a quarter of a fresh one.
    #[test]
    fn test_decay_reduces_old_commits() {
        let now = Utc::now();
        let cfg = CoChangeConfig {
            half_life_days: 90.0,
            ..lenient_config()
        };
        let history = [
            commit_at(now, 0, &["src/a.rs", "src/b.rs"]),
            commit_at(now, 180, &["src/c.rs", "src/d.rs"]),
        ];
        let matrix = CoChangeMatrix::from_commits(&history, &cfg, now);

        let w_recent = matrix
            .weight_by_path("src/a.rs", "src/b.rs")
            .expect("recent pair");
        let w_old = matrix
            .weight_by_path("src/c.rs", "src/d.rs")
            .expect("old pair");

        assert!(
            w_recent > w_old * 3.0,
            "recent ({w_recent}) should be much larger than old ({w_old})"
        );
        assert!(
            (w_old - 0.25).abs() < 0.05,
            "expected ~0.25 for 2 half-lives, got {w_old}"
        );
    }

    // Commits listing more files than the cap are ignored altogether.
    #[test]
    fn test_skip_large_commits() {
        let now = Utc::now();
        let cfg = CoChangeConfig {
            max_files_per_commit: 3,
            ..lenient_config()
        };
        let history = [commit_at(now, 0, &["a.rs", "b.rs", "c.rs", "d.rs"])];
        let matrix = CoChangeMatrix::from_commits(&history, &cfg, now);

        assert!(matrix.is_empty(), "large commit should be skipped");
        assert_eq!(matrix.commits_analyzed(), 0);
    }

    // A heavily decayed pair is analyzed but pruned by the weight threshold.
    #[test]
    fn test_min_weight_filter() {
        let now = Utc::now();
        let cfg = CoChangeConfig {
            half_life_days: 10.0,
            min_weight: 0.5,
            min_decay: 0.0,
            ..default_config()
        };
        let history = [commit_at(now, 100, &["old_a.rs", "old_b.rs"])];
        let matrix = CoChangeMatrix::from_commits(&history, &cfg, now);

        assert!(matrix.is_empty(), "pair below min_weight should be pruned");
        assert_eq!(matrix.commits_analyzed(), 1);
    }

    // Only the first `max_commits` records are looked at.
    #[test]
    fn test_max_commits_cap() {
        let now = Utc::now();
        let cfg = CoChangeConfig {
            max_commits: 2,
            ..lenient_config()
        };
        let mut history = Vec::new();
        for age in 0..5 {
            history.push(commit_at(now, age, &["a.py", "b.py"]));
        }
        let matrix = CoChangeMatrix::from_commits(&history, &cfg, now);

        assert_eq!(matrix.commits_analyzed(), 2);
        assert_eq!(matrix.len(), 1);
    }

    // Three files in one commit produce all three unordered pairs.
    #[test]
    fn test_three_files_produce_three_pairs() {
        let now = Utc::now();
        let history = [commit_at(now, 0, &["x.rs", "y.rs", "z.rs"])];
        let matrix = CoChangeMatrix::from_commits(&history, &default_config(), now);

        assert_eq!(matrix.len(), 3);
        for (left, right) in [("x.rs", "y.rs"), ("x.rs", "z.rs"), ("y.rs", "z.rs")] {
            assert!(matrix.weight_by_path(left, right).is_some());
        }
    }

    // Repeated co-changes of the same pair add up.
    #[test]
    fn test_accumulates_across_commits() {
        let now = Utc::now();
        let history = [
            commit_at(now, 0, &["shared_a.rs", "shared_b.rs"]),
            commit_at(now, 1, &["shared_a.rs", "shared_b.rs"]),
        ];
        let matrix = CoChangeMatrix::from_commits(&history, &lenient_config(), now);

        let w = matrix
            .weight_by_path("shared_a.rs", "shared_b.rs")
            .expect("accumulated pair");
        assert!(w > 1.5, "expected accumulated weight > 1.5, got {w}");
    }

    // Per-file weights equal the number of fresh commits touching each file.
    #[test]
    fn test_file_weights_tracked() {
        let now = Utc::now();
        let history = [
            commit_at(now, 0, &["a.rs", "b.rs"]),
            commit_at(now, 0, &["a.rs", "b.rs"]),
            commit_at(now, 0, &["a.rs", "c.rs"]),
        ];
        let matrix = CoChangeMatrix::from_commits(&history, &lenient_config(), now);

        let ka = key_of("a.rs", "a.rs should be interned");
        let kb = key_of("b.rs", "b.rs should be interned");
        let kc = key_of("c.rs", "c.rs should be interned");

        let wa = matrix.file_weight(ka).expect("a.rs should have file weight");
        let wb = matrix.file_weight(kb).expect("b.rs should have file weight");
        let wc = matrix.file_weight(kc).expect("c.rs should have file weight");

        assert!(
            (wa - 3.0).abs() < 0.01,
            "a.rs appears in 3 commits, expected ~3.0, got {wa}"
        );
        assert!(
            (wb - 2.0).abs() < 0.01,
            "b.rs appears in 2 commits, expected ~2.0, got {wb}"
        );
        assert!(
            (wc - 1.0).abs() < 0.01,
            "c.rs appears in 1 commit, expected ~1.0, got {wc}"
        );
    }

    // The total decay weight counts every analyzed commit, single-file ones included.
    #[test]
    fn test_total_decay_weight() {
        let now = Utc::now();
        let history = [
            commit_at(now, 0, &["a.rs", "b.rs"]),
            commit_at(now, 0, &["c.rs", "d.rs"]),
            commit_at(now, 0, &["e.rs"]),
        ];
        let matrix = CoChangeMatrix::from_commits(&history, &lenient_config(), now);

        let total = matrix.total_decay_weight();
        assert!(
            (total - 3.0).abs() < 0.01,
            "expected total_decay_weight ~3.0, got {}",
            total
        );
    }

    // Lift matches the smoothed formula for a shared and an exclusive pair.
    #[test]
    fn test_lift_computation() {
        let now = Utc::now();
        let history = [
            commit_at(now, 0, &["a.rs", "b.rs"]),
            commit_at(now, 0, &["a.rs", "b.rs"]),
            commit_at(now, 0, &["a.rs", "c.rs"]),
            commit_at(now, 0, &["d.rs", "e.rs"]),
            commit_at(now, 0, &["f.rs", "g.rs"]),
        ];
        let matrix = CoChangeMatrix::from_commits(&history, &lenient_config(), now);

        // Seven distinct files, five commits of weight ~1 each.
        let n: f32 = 7.0;
        let smoothed = |pair: f32, left: f32, right: f32| -> f32 {
            (pair + 1.0) * (5.0 + 1.0 * n * n) / ((left + 1.0 * n) * (right + 1.0 * n))
        };

        let ka = key_of("a.rs", "a.rs interned");
        let kb = key_of("b.rs", "b.rs interned");
        let lift = matrix.lift(ka, kb).expect("lift should be computable");
        let expected = smoothed(2.0, 3.0, 2.0);
        assert!(
            (lift - expected).abs() < 0.05,
            "expected lift ~{expected:.3}, got {lift:.3}"
        );

        let kd = key_of("d.rs", "d.rs interned");
        let ke = key_of("e.rs", "e.rs interned");
        let lift_de = matrix
            .lift(kd, ke)
            .expect("lift should be computable for d,e");
        let expected_de = smoothed(1.0, 1.0, 1.0);
        assert!(
            (lift_de - expected_de).abs() < 0.05,
            "expected lift ~{expected_de:.3} for exclusive pair, got {lift_de:.3}"
        );
    }

    // Lift is undefined for files the matrix has never seen.
    #[test]
    fn test_lift_none_for_missing_files() {
        let matrix = CoChangeMatrix::empty();
        let interner = global_interner();
        let ka = interner.intern("nonexistent_a.rs");
        let kb = interner.intern("nonexistent_b.rs");
        assert!(
            matrix.lift(ka, kb).is_none(),
            "lift should be None for files not in matrix"
        );
    }
}