use std::collections::HashMap;
/// Maps each f64 label to a dense cluster index in `0..k`, returning the
/// per-sample indices and the number of distinct labels `k`.
///
/// Labels are distinguished by their exact floating-point value (bit
/// pattern), so fractional labels such as 0.5 and 0.9 form separate
/// clusters; `0.0` and `-0.0` are treated as the same label.
fn label_to_index(labels: &[f64]) -> (Vec<usize>, usize) {
    let mut map: HashMap<u64, usize> = HashMap::new();
    let mut next = 0usize;
    let indices: Vec<usize> = labels
        .iter()
        .map(|&v| {
            // Key on the bit pattern instead of a truncating `as i64` cast:
            // the cast collapsed distinct fractional labels (0.5 and 0.9
            // both became key 0) and saturated NaN/infinities onto shared
            // keys. Normalize -0.0 to +0.0 so the two zeros stay one label.
            let key = if v == 0.0 { 0u64 } else { v.to_bits() };
            *map.entry(key).or_insert_with(|| {
                let idx = next;
                next += 1;
                idx
            })
        })
        .collect();
    (indices, next)
}

/// Adjusted Rand Index (ARI) between two flat labelings.
///
/// Builds the contingency table of the two partitions and evaluates
/// `ARI = (Index - ExpectedIndex) / (MaxIndex - ExpectedIndex)`, where each
/// term is a sum of pair counts `C(x, 2)`. The result is 1.0 for identical
/// partitions (up to relabeling) and near 0.0 for chance-level agreement.
///
/// Returns 0.0 when there are fewer than two samples (no pairs exist, so
/// the index is undefined).
///
/// # Panics
/// Panics if the two slices have different lengths, or if the sample count
/// does not fit in an `i64`.
pub fn adjusted_rand_index(labels_true: &[f64], labels_pred: &[f64]) -> f64 {
    // Previously a shorter `labels_pred` panicked with an opaque
    // out-of-bounds index and a longer one was silently truncated; fail
    // loudly with a clear message instead.
    assert_eq!(
        labels_true.len(),
        labels_pred.len(),
        "labels_true and labels_pred must have the same length"
    );
    let n = labels_true.len();
    if n == 0 {
        return 0.0;
    }
    let (true_idx, n_true) = label_to_index(labels_true);
    let (pred_idx, n_pred) = label_to_index(labels_pred);
    // contingency[i][j] = samples assigned to true cluster i and pred cluster j.
    let mut contingency = vec![vec![0i64; n_pred]; n_true];
    for i in 0..n {
        contingency[true_idx[i]][pred_idx[i]] += 1;
    }
    // Row sums (true-cluster sizes) and column sums (pred-cluster sizes).
    let a: Vec<i64> = contingency.iter().map(|row| row.iter().sum()).collect();
    let b: Vec<i64> = (0..n_pred)
        .map(|j| contingency.iter().map(|row| row[j]).sum())
        .collect();
    // C(x, 2): unordered pairs among x items.
    let comb2 = |x: i64| -> i64 { x * (x - 1) / 2 };
    let sum_comb_c: i64 = contingency
        .iter()
        .flat_map(|row| row.iter())
        .map(|&x| comb2(x))
        .sum();
    let sum_comb_a: i64 = a.iter().map(|&x| comb2(x)).sum();
    let sum_comb_b: i64 = b.iter().map(|&x| comb2(x)).sum();
    let n_i64 = i64::try_from(n).expect("sample count overflows i64");
    let comb_n = comb2(n_i64);
    if comb_n == 0 {
        // A single sample: no pairs to agree or disagree on.
        return 0.0;
    }
    let expected = sum_comb_a as f64 * sum_comb_b as f64 / comb_n as f64;
    let max_index = f64::midpoint(sum_comb_a as f64, sum_comb_b as f64);
    // Degenerate case where the denominator vanishes (e.g. both sides put
    // every sample in its own cluster, or everything in one cluster): the
    // ratio is 0/0, so report exact agreement as 1.0 and anything else 0.0.
    if (max_index - expected).abs() < 1e-15 {
        return if (sum_comb_c as f64 - expected).abs() < 1e-15 {
            1.0
        } else {
            0.0
        };
    }
    (sum_comb_c as f64 - expected) / (max_index - expected)
}
/// Calinski-Harabasz score (variance-ratio criterion) of a clustering.
///
/// `features` is column-major: `features[f][i]` is the value of feature `f`
/// for sample `i`, so every column must hold one entry per label. The score
/// is `(BGSS / (k - 1)) / (WGSS / (n - k))`, where BGSS is the count-weighted
/// squared distance of cluster centroids from the global centroid and WGSS is
/// the squared distance of each sample from its own cluster centroid. Higher
/// values indicate denser, better-separated clusters.
///
/// Returns 0.0 for degenerate inputs: no samples, no features, fewer than two
/// clusters, `k >= n` (the `n - k` denominator would be non-positive), or
/// zero within-cluster dispersion.
///
/// # Panics
/// Panics if any feature column's length differs from `labels.len()`.
pub fn calinski_harabasz_score(features: &[Vec<f64>], labels: &[f64]) -> f64 {
    let n = labels.len();
    if n == 0 || features.is_empty() {
        return 0.0;
    }
    // A shorter column would panic with an opaque out-of-bounds index below;
    // a longer one would silently skew the global centroid (its mean divides
    // by `n` but sums every element). Fail loudly instead.
    assert!(
        features.iter().all(|col| col.len() == n),
        "every feature column must have exactly one value per sample"
    );
    let n_features = features.len();
    let (label_idx, k) = label_to_index(labels);
    if k <= 1 || k >= n {
        return 0.0;
    }
    // Mean of each feature over all samples.
    let global_centroid: Vec<f64> = features
        .iter()
        .map(|col| col.iter().sum::<f64>() / n as f64)
        .collect();
    // Accumulate per-cluster feature sums and sizes in one pass.
    let mut cluster_sums = vec![vec![0.0_f64; n_features]; k];
    let mut cluster_counts = vec![0usize; k];
    for i in 0..n {
        let ci = label_idx[i];
        cluster_counts[ci] += 1;
        for f in 0..n_features {
            cluster_sums[ci][f] += features[f][i];
        }
    }
    let cluster_centroids: Vec<Vec<f64>> = cluster_sums
        .iter()
        .zip(cluster_counts.iter())
        .map(|(sums, &cnt)| {
            if cnt == 0 {
                // Unreachable in practice: label_to_index only emits indices
                // that occur in `labels`, so every cluster has >= 1 member.
                vec![0.0; n_features]
            } else {
                sums.iter().map(|s| s / cnt as f64).collect()
            }
        })
        .collect();
    // Between-group sum of squares, weighted by cluster size.
    let mut bgss = 0.0;
    for ci in 0..k {
        let cnt = cluster_counts[ci] as f64;
        let dist_sq: f64 = cluster_centroids[ci]
            .iter()
            .zip(global_centroid.iter())
            .map(|(a, b)| (a - b).powi(2))
            .sum();
        bgss += cnt * dist_sq;
    }
    // Within-group sum of squares.
    let mut wgss = 0.0;
    for i in 0..n {
        let ci = label_idx[i];
        for f in 0..n_features {
            wgss += (features[f][i] - cluster_centroids[ci][f]).powi(2);
        }
    }
    if wgss < 1e-15 {
        // Perfectly tight clusters would divide by ~0; treat as undefined.
        return 0.0;
    }
    (bgss / (k as f64 - 1.0)) / (wgss / (n as f64 - k as f64))
}
/// Davies-Bouldin score of a clustering (lower is better, 0.0 is best).
///
/// `features` is column-major: `features[f][i]` is the value of feature `f`
/// for sample `i`. For every cluster the score takes the worst (largest)
/// ratio `(s_i + s_j) / d_ij` against any other cluster — where `s` is the
/// mean distance of a cluster's samples to its centroid and `d_ij` the
/// distance between the two centroids — and averages those maxima over all
/// `k` clusters.
///
/// Returns 0.0 for degenerate inputs: no samples, no features, or fewer
/// than two clusters.
///
/// # Panics
/// Panics if any feature column's length differs from `labels.len()`.
pub fn davies_bouldin_score(features: &[Vec<f64>], labels: &[f64]) -> f64 {
    let n = labels.len();
    if n == 0 || features.is_empty() {
        return 0.0;
    }
    // A shorter column would panic with an opaque out-of-bounds index below;
    // a longer one would be silently ignored. Fail loudly instead.
    assert!(
        features.iter().all(|col| col.len() == n),
        "every feature column must have exactly one value per sample"
    );
    let n_features = features.len();
    let (label_idx, k) = label_to_index(labels);
    if k <= 1 {
        return 0.0;
    }
    // Accumulate per-cluster feature sums and sizes in one pass.
    let mut cluster_sums = vec![vec![0.0_f64; n_features]; k];
    let mut cluster_counts = vec![0usize; k];
    for i in 0..n {
        let ci = label_idx[i];
        cluster_counts[ci] += 1;
        for f in 0..n_features {
            cluster_sums[ci][f] += features[f][i];
        }
    }
    let centroids: Vec<Vec<f64>> = cluster_sums
        .iter()
        .zip(cluster_counts.iter())
        .map(|(sums, &cnt)| {
            if cnt == 0 {
                // Unreachable in practice: label_to_index only emits indices
                // that occur in `labels`, so every cluster has >= 1 member.
                vec![0.0; n_features]
            } else {
                sums.iter().map(|s| s / cnt as f64).collect()
            }
        })
        .collect();
    // scatter[ci] = mean Euclidean distance of cluster ci's samples to its
    // centroid.
    let mut scatter = vec![0.0_f64; k];
    for i in 0..n {
        let ci = label_idx[i];
        let dist: f64 = (0..n_features)
            .map(|f| (features[f][i] - centroids[ci][f]).powi(2))
            .sum::<f64>()
            .sqrt();
        scatter[ci] += dist;
    }
    for ci in 0..k {
        if cluster_counts[ci] > 0 {
            scatter[ci] /= cluster_counts[ci] as f64;
        }
    }
    // For each cluster, take the worst similarity ratio against any other
    // cluster, then average over clusters.
    let mut db_sum = 0.0;
    for i in 0..k {
        let mut max_r = f64::NEG_INFINITY;
        for j in 0..k {
            if i == j {
                continue;
            }
            let d_ij: f64 = centroids[i]
                .iter()
                .zip(centroids[j].iter())
                .map(|(a, b)| (a - b).powi(2))
                .sum::<f64>()
                .sqrt();
            // Coincident centroids contribute 0 instead of dividing by ~0.
            let r = if d_ij < 1e-15 {
                0.0
            } else {
                (scatter[i] + scatter[j]) / d_ij
            };
            if r > max_r {
                max_r = r;
            }
        }
        db_sum += max_r;
    }
    db_sum / k as f64
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two tight clusters far apart along both feature axes, plus matching
    /// labels; shared by the separation tests below.
    fn separated_fixture() -> (Vec<f64>, Vec<f64>, Vec<f64>) {
        let axis = vec![0.0, 0.1, 0.2, 10.0, 10.1, 10.2];
        let labels = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
        (axis.clone(), axis, labels)
    }

    #[test]
    fn test_ari_identical() {
        // Comparing a partition against itself must give a perfect score.
        let partition = [0.0, 0.0, 1.0, 1.0, 2.0, 2.0];
        let ari = adjusted_rand_index(&partition, &partition);
        assert!(
            (ari - 1.0).abs() < 1e-10,
            "identical ARI should be 1.0, got {ari}"
        );
    }

    #[test]
    fn test_ari_permuted() {
        // ARI is invariant under relabeling of cluster ids.
        let reference = [0.0, 0.0, 1.0, 1.0];
        let relabeled = [5.0, 5.0, 3.0, 3.0];
        let ari = adjusted_rand_index(&reference, &relabeled);
        assert!(
            (ari - 1.0).abs() < 1e-10,
            "permuted ARI should be 1.0, got {ari}"
        );
    }

    #[test]
    fn test_ari_random() {
        // A labeling uncorrelated with the reference should score near zero.
        let reference = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
        let shuffled = [0.0, 1.0, 0.0, 1.0, 0.0, 1.0];
        let ari = adjusted_rand_index(&reference, &shuffled);
        assert!(ari < 0.5, "random ARI should be low, got {ari}");
    }

    #[test]
    fn test_calinski_harabasz_well_separated() {
        let (f0, f1, labels) = separated_fixture();
        let ch = calinski_harabasz_score(&[f0, f1], &labels);
        assert!(ch > 100.0, "well-separated CH should be high, got {ch}");
    }

    #[test]
    fn test_davies_bouldin_well_separated() {
        let (f0, f1, labels) = separated_fixture();
        let db = davies_bouldin_score(&[f0, f1], &labels);
        assert!(db < 0.5, "well-separated DB should be low, got {db}");
    }

    #[test]
    fn test_davies_bouldin_overlapping() {
        // Interleaved clusters should produce a clearly non-zero DB index.
        let axis = vec![0.0, 1.0, 2.0, 1.5, 2.5, 3.0];
        let labels = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
        let db = davies_bouldin_score(&[axis.clone(), axis], &labels);
        assert!(db > 0.3, "overlapping DB should be non-trivial, got {db}");
    }
}