use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis};
use scirs2_core::numeric::{Float, FromPrimitive};
use std::fmt::Debug;
use crate::error::{ClusteringError, Result};
use crate::metrics::silhouette_score;
/// Computes the Davies-Bouldin index of a clustering.
///
/// The index averages, over all clusters, the worst-case ratio of the summed
/// within-cluster scatters of a cluster pair to the distance between their
/// centroids. Lower values indicate more compact, better-separated clusters.
///
/// # Arguments
/// * `data` - Sample matrix of shape `(n_samples, n_features)`.
/// * `labels` - Cluster label per sample; negative labels (noise points) are ignored.
///
/// # Errors
/// Returns `ClusteringError::InvalidInput` when `data` and `labels` disagree on
/// the number of samples or fewer than two clusters are present.
pub fn davies_bouldin_score<F>(data: ArrayView2<F>, labels: ArrayView1<i32>) -> Result<F>
where
    F: Float + FromPrimitive + Debug + PartialOrd + 'static,
{
    if data.shape()[0] != labels.shape()[0] {
        return Err(ClusteringError::InvalidInput(
            "Data and labels must have the same number of samples".to_string(),
        ));
    }
    // Collect distinct non-negative labels in first-seen order.
    let mut unique_labels = Vec::new();
    for &label in labels.iter() {
        if label >= 0 && !unique_labels.contains(&label) {
            unique_labels.push(label);
        }
    }
    let n_clusters = unique_labels.len();
    if n_clusters < 2 {
        return Err(ClusteringError::InvalidInput(
            "Davies-Bouldin score requires at least 2 clusters".to_string(),
        ));
    }
    // Accumulate per-cluster coordinate sums, then divide by size to get centroids.
    let mut centers = Array2::<F>::zeros((n_clusters, data.shape()[1]));
    let mut cluster_sizes = vec![0; n_clusters];
    for (i, &label) in labels.iter().enumerate() {
        if label >= 0 {
            let cluster_idx = unique_labels
                .iter()
                .position(|&l| l == label)
                .expect("Operation failed");
            centers
                .row_mut(cluster_idx)
                .scaled_add(F::one(), &data.row(i));
            cluster_sizes[cluster_idx] += 1;
        }
    }
    for (i, &size) in cluster_sizes.iter().enumerate() {
        if size > 0 {
            centers
                .row_mut(i)
                .mapv_inplace(|x| x / F::from(size).expect("Failed to convert to float"));
        }
    }
    // Scatter: mean Euclidean distance of each cluster's members to its centroid.
    let mut scatter = vec![F::zero(); n_clusters];
    for (i, &label) in labels.iter().enumerate() {
        if label >= 0 {
            let cluster_idx = unique_labels
                .iter()
                .position(|&l| l == label)
                .expect("Operation failed");
            let center = centers.row(cluster_idx);
            // BUGFIX: this line was mojibake (`¢er`, a mangled `&center`)
            // and did not compile; restored the centroid difference.
            let diff = &data.row(i) - &center;
            let distance = diff.dot(&diff).sqrt();
            scatter[cluster_idx] = scatter[cluster_idx] + distance;
        }
    }
    for (i, &size) in cluster_sizes.iter().enumerate() {
        if size > 0 {
            scatter[i] = scatter[i] / F::from(size).expect("Failed to convert to float");
        }
    }
    // For each cluster, take the worst (largest) similarity ratio against every
    // other cluster, then average those maxima over all clusters.
    let mut db_index = F::zero();
    for i in 0..n_clusters {
        let mut max_ratio = F::zero();
        for j in 0..n_clusters {
            if i != j {
                // BUGFIX: restored `&centers` (source was mojibake `¢ers`).
                let between_distance = (&centers.row(i) - &centers.row(j))
                    .mapv(|x| x * x)
                    .sum()
                    .sqrt();
                // Coincident centroids would make the ratio blow up; skip them.
                if between_distance > F::zero() {
                    let ratio = (scatter[i] + scatter[j]) / between_distance;
                    if ratio > max_ratio {
                        max_ratio = ratio;
                    }
                }
            }
        }
        db_index = db_index + max_ratio;
    }
    db_index = db_index / F::from(n_clusters).expect("Failed to convert to float");
    Ok(db_index)
}
/// Computes the Calinski-Harabasz index (variance-ratio criterion).
///
/// The score is the ratio of between-cluster dispersion (SSB) to
/// within-cluster dispersion (SSW), scaled by the degrees of freedom:
/// `(SSB / SSW) * (n - k) / (k - 1)`. Higher values indicate denser,
/// better-separated clusters.
///
/// # Arguments
/// * `data` - Sample matrix of shape `(n_samples, n_features)`.
/// * `labels` - Cluster label per sample; negative labels (noise points) are ignored.
///
/// # Errors
/// Returns `ClusteringError::InvalidInput` when sample counts disagree, fewer
/// than two clusters are present, or there are at least as many clusters as samples.
pub fn calinski_harabasz_score<F>(data: ArrayView2<F>, labels: ArrayView1<i32>) -> Result<F>
where
    F: Float + FromPrimitive + Debug + PartialOrd + 'static,
{
    if data.shape()[0] != labels.shape()[0] {
        return Err(ClusteringError::InvalidInput(
            "Data and labels must have the same number of samples".to_string(),
        ));
    }
    let n_samples = data.shape()[0];
    let n_features = data.shape()[1];
    // Collect distinct non-negative labels in first-seen order.
    let mut unique_labels = Vec::new();
    for &label in labels.iter() {
        if label >= 0 && !unique_labels.contains(&label) {
            unique_labels.push(label);
        }
    }
    let n_clusters = unique_labels.len();
    if n_clusters < 2 {
        return Err(ClusteringError::InvalidInput(
            "Calinski-Harabasz score requires at least 2 clusters".to_string(),
        ));
    }
    if n_clusters >= n_samples {
        return Err(ClusteringError::InvalidInput(
            "Number of clusters must be less than number of samples".to_string(),
        ));
    }
    // Overall mean over the valid (non-noise) samples only.
    let mut overall_mean = Array1::<F>::zeros(n_features);
    let mut valid_samples = 0;
    for (i, &label) in labels.iter().enumerate() {
        if label >= 0 {
            overall_mean.scaled_add(F::one(), &data.row(i));
            valid_samples += 1;
        }
    }
    overall_mean.mapv_inplace(|x| x / F::from(valid_samples).expect("Failed to convert to float"));
    // Per-cluster centroids: accumulate sums, then divide by cluster size.
    let mut centers = Array2::<F>::zeros((n_clusters, n_features));
    let mut cluster_sizes = vec![0; n_clusters];
    for (i, &label) in labels.iter().enumerate() {
        if label >= 0 {
            let cluster_idx = unique_labels
                .iter()
                .position(|&l| l == label)
                .expect("Operation failed");
            centers
                .row_mut(cluster_idx)
                .scaled_add(F::one(), &data.row(i));
            cluster_sizes[cluster_idx] += 1;
        }
    }
    for (i, &size) in cluster_sizes.iter().enumerate() {
        if size > 0 {
            centers
                .row_mut(i)
                .mapv_inplace(|x| x / F::from(size).expect("Failed to convert to float"));
        }
    }
    // Between-cluster sum of squares, weighted by cluster size.
    let mut ssb = F::zero();
    for (i, &size) in cluster_sizes.iter().enumerate() {
        if size > 0 {
            // BUGFIX: this line was mojibake (`¢ers`, a mangled `&centers`)
            // and did not compile; restored the centroid-to-mean difference.
            let diff = &centers.row(i) - &overall_mean;
            ssb = ssb + F::from(size).expect("Failed to convert to float") * diff.dot(&diff);
        }
    }
    // Within-cluster sum of squares.
    let mut ssw = F::zero();
    for (i, &label) in labels.iter().enumerate() {
        if label >= 0 {
            let cluster_idx = unique_labels
                .iter()
                .position(|&l| l == label)
                .expect("Operation failed");
            // BUGFIX: restored `&centers` (source was mojibake `¢ers`).
            let diff = &data.row(i) - &centers.row(cluster_idx);
            ssw = ssw + diff.dot(&diff);
        }
    }
    // Perfectly tight clusters: the ratio diverges, report +inf.
    if ssw == F::zero() {
        return Ok(F::infinity());
    }
    let score = (ssb / ssw)
        * F::from(valid_samples - n_clusters).expect("Failed to convert to float")
        / F::from(n_clusters - 1).expect("Failed to convert to float");
    Ok(score)
}
/// Computes the mean silhouette coefficient over all samples.
///
/// Thin convenience wrapper that forwards directly to
/// [`crate::metrics::silhouette_score`]; see that function for the exact
/// semantics, accepted label values, and error conditions.
pub fn mean_silhouette_score<F>(data: ArrayView2<F>, labels: ArrayView1<i32>) -> Result<F>
where
    F: Float + FromPrimitive + 'static,
{
    silhouette_score(data, labels)
}
pub fn adjusted_rand_index<F>(
labels_true: ArrayView1<i32>,
labels_pred: ArrayView1<i32>,
) -> Result<F>
where
F: Float + FromPrimitive + Debug + 'static,
{
if labels_true.len() != labels_pred.len() {
return Err(ClusteringError::InvalidInput(
"Labels arrays must have the same length".to_string(),
));
}
let n = labels_true.len();
if n == 0 {
return Err(ClusteringError::InvalidInput(
"Empty labels arrays".to_string(),
));
}
let mut true_labels = std::collections::HashSet::new();
let mut pred_labels = std::collections::HashSet::new();
for &label in labels_true.iter() {
true_labels.insert(label);
}
for &label in labels_pred.iter() {
pred_labels.insert(label);
}
let n_true = true_labels.len();
let n_pred = pred_labels.len();
let true_label_map: std::collections::HashMap<i32, usize> = true_labels
.iter()
.enumerate()
.map(|(i, &label)| (label, i))
.collect();
let pred_label_map: std::collections::HashMap<i32, usize> = pred_labels
.iter()
.enumerate()
.map(|(i, &label)| (label, i))
.collect();
let mut contingency = Array2::<usize>::zeros((n_true, n_pred));
for i in 0..n {
let true_idx = true_label_map[&labels_true[i]];
let pred_idx = pred_label_map[&labels_pred[i]];
contingency[[true_idx, pred_idx]] += 1;
}
let sum_comb_c = contingency
.iter()
.map(|&n_ij| {
if n_ij >= 2 {
(n_ij * (n_ij - 1)) / 2
} else {
0
}
})
.sum::<usize>();
let sum_a = contingency
.sum_axis(Axis(1))
.iter()
.map(|&n_i| if n_i >= 2 { (n_i * (n_i - 1)) / 2 } else { 0 })
.sum::<usize>();
let sum_b = contingency
.sum_axis(Axis(0))
.iter()
.map(|&n_j| if n_j >= 2 { (n_j * (n_j - 1)) / 2 } else { 0 })
.sum::<usize>();
let n_choose_2 = if n >= 2 { (n * (n - 1)) / 2 } else { 0 };
let expected_index = F::from(sum_a).expect("Failed to convert to float")
* F::from(sum_b).expect("Failed to convert to float")
/ F::from(n_choose_2).expect("Failed to convert to float");
let max_index = (F::from(sum_a).expect("Failed to convert to float")
+ F::from(sum_b).expect("Failed to convert to float"))
/ F::from(2.0).expect("Failed to convert constant to float");
let index = F::from(sum_comb_c).expect("Failed to convert to float");
if max_index == expected_index {
return Ok(F::zero());
}
let ari = (index - expected_index) / (max_index - expected_index);
Ok(ari)
}
/// Computes the normalized mutual information (NMI) between two labelings.
///
/// The raw mutual information is divided by a normalizer built from the two
/// marginal entropies; `average_method` selects which one:
/// `"arithmetic"`, `"geometric"`, `"min"`, or `"max"`.
///
/// # Errors
/// Returns `ClusteringError::InvalidInput` when the arrays differ in length
/// or the averaging method is not recognized.
pub fn normalized_mutual_info<F>(
    labels_true: ArrayView1<i32>,
    labels_pred: ArrayView1<i32>,
    average_method: &str,
) -> Result<F>
where
    F: Float + FromPrimitive + Debug + 'static,
{
    if labels_true.len() != labels_pred.len() {
        return Err(ClusteringError::InvalidInput(
            "Labels arrays must have the same length".to_string(),
        ));
    }
    // Two empty labelings are vacuously in perfect agreement.
    if labels_true.is_empty() {
        return Ok(F::one());
    }
    let mi = mutual_info::<F>(labels_true, labels_pred)?;
    let h_true = entropy::<F>(labels_true)?;
    let h_pred = entropy::<F>(labels_pred)?;
    // Both partitions are single-cluster: NMI is defined as 1 by convention.
    if h_true == F::zero() && h_pred == F::zero() {
        return Ok(F::one());
    }
    let normalizer = match average_method {
        "arithmetic" => {
            (h_true + h_pred) / F::from(2.0).expect("Failed to convert constant to float")
        }
        "geometric" => (h_true * h_pred).sqrt(),
        "min" => h_true.min(h_pred),
        "max" => h_true.max(h_pred),
        _ => {
            return Err(ClusteringError::InvalidInput(
                "Invalid average method. Use 'arithmetic', 'geometric', 'min', or 'max'"
                    .to_string(),
            ))
        }
    };
    // Exactly one entropy is zero under "geometric"/"min": avoid 0/0.
    if normalizer == F::zero() {
        return Ok(F::zero());
    }
    Ok(mi / normalizer)
}
/// Computes homogeneity, completeness, and V-measure for a clustering.
///
/// * Homogeneity `= 1 - H(true|pred) / H(true)`: each predicted cluster
///   contains only members of a single true class.
/// * Completeness `= 1 - H(pred|true) / H(pred)`: all members of a true class
///   are assigned to the same predicted cluster.
/// * V-measure: harmonic mean of the two.
///
/// # Errors
/// Returns `ClusteringError::InvalidInput` when the arrays differ in length.
pub fn homogeneity_completeness_v_measure<F>(
    labels_true: ArrayView1<i32>,
    labels_pred: ArrayView1<i32>,
) -> Result<(F, F, F)>
where
    F: Float + FromPrimitive + Debug + 'static,
{
    if labels_true.len() != labels_pred.len() {
        return Err(ClusteringError::InvalidInput(
            "Labels arrays must have the same length".to_string(),
        ));
    }
    let n = labels_true.len();
    if n == 0 {
        return Ok((F::one(), F::one(), F::one()));
    }
    let h_true = entropy::<F>(labels_true)?;
    let h_pred = entropy::<F>(labels_pred)?;
    // BUGFIX: the original short-circuited with (1, 1, 1) whenever EITHER
    // entropy was zero, so a single predicted cluster over varied true labels
    // was wrongly reported as perfectly homogeneous. Its later guards also
    // tested the wrong entropy (h_pred for homogeneity, h_true for
    // completeness). Guard each score on the entropy actually used as its
    // denominator: a zero denominator means the score is trivially perfect.
    let homogeneity = if h_true == F::zero() {
        F::one()
    } else {
        let h_true_given_pred = conditional_entropy::<F>(labels_true, labels_pred)?;
        F::one() - h_true_given_pred / h_true
    };
    let completeness = if h_pred == F::zero() {
        F::one()
    } else {
        let h_pred_given_true = conditional_entropy::<F>(labels_pred, labels_true)?;
        F::one() - h_pred_given_true / h_pred
    };
    // Harmonic mean; defined as 0 when both scores are 0.
    let v_measure = if homogeneity + completeness == F::zero() {
        F::zero()
    } else {
        F::from(2.0).expect("Failed to convert constant to float") * homogeneity * completeness
            / (homogeneity + completeness)
    };
    Ok((homogeneity, completeness, v_measure))
}
/// Estimates the mutual information (in nats) between two labelings from
/// their contingency table: `MI = Σ_ij p_ij · ln(p_ij / (p_i · p_j))`.
fn mutual_info<F>(labels_true: ArrayView1<i32>, labels_pred: ArrayView1<i32>) -> Result<F>
where
    F: Float + FromPrimitive + Debug + 'static,
{
    let total = labels_true.len() as f64;
    let contingency = build_contingency_matrix(labels_true, labels_pred)?;
    let row_sums = contingency.sum_axis(Axis(1));
    let col_sums = contingency.sum_axis(Axis(0));
    // Accumulate term-by-term in f64, converting each contribution to F.
    let mut mi = F::zero();
    for ((i, j), &cell) in contingency.indexed_iter() {
        let n_ij = cell as f64;
        // Empty cells contribute nothing (and ln would be undefined).
        if n_ij > 0.0 {
            let joint = n_ij / total;
            let expected = (row_sums[i] as f64) * (col_sums[j] as f64) / total;
            let term = joint * (n_ij / expected).ln();
            mi = mi + F::from(term).expect("Failed to convert to float");
        }
    }
    Ok(mi)
}
/// Computes the Shannon entropy (in nats) of a label distribution:
/// `H = -Σ p · ln(p)` over the empirical label frequencies.
fn entropy<F>(labels: ArrayView1<i32>) -> Result<F>
where
    F: Float + FromPrimitive + Debug + 'static,
{
    let total = labels.len() as f64;
    // Tally occurrences of each distinct label.
    let mut counts: std::collections::HashMap<i32, usize> = std::collections::HashMap::new();
    for &label in labels.iter() {
        *counts.entry(label).or_insert(0) += 1;
    }
    let mut h = F::zero();
    for &count in counts.values() {
        if count > 0 {
            let p = count as f64 / total;
            h = h - F::from(p * p.ln()).expect("Operation failed");
        }
    }
    Ok(h)
}
/// Builds the contingency table of two labelings: entry `(i, j)` counts the
/// samples carrying the i-th distinct `labels_true` value and the j-th
/// distinct `labels_pred` value, with rows/columns ordered by ascending label.
fn build_contingency_matrix(
    labels_true: ArrayView1<i32>,
    labels_pred: ArrayView1<i32>,
) -> Result<Array2<usize>> {
    use std::collections::{BTreeSet, HashMap};
    // Sorted distinct labels give a deterministic row/column ordering.
    let true_set: BTreeSet<i32> = labels_true.iter().copied().collect();
    let pred_set: BTreeSet<i32> = labels_pred.iter().copied().collect();
    // Map each label value to its row/column index in the table.
    let row_of: HashMap<i32, usize> = true_set
        .iter()
        .enumerate()
        .map(|(i, &label)| (label, i))
        .collect();
    let col_of: HashMap<i32, usize> = pred_set
        .iter()
        .enumerate()
        .map(|(j, &label)| (label, j))
        .collect();
    let mut table = Array2::<usize>::zeros((true_set.len(), pred_set.len()));
    for (&t, &p) in labels_true.iter().zip(labels_pred.iter()) {
        table[[row_of[&t], col_of[&p]]] += 1;
    }
    Ok(table)
}
/// Estimates the conditional entropy H(X | Y) in nats from the empirical
/// joint distribution: `H(X|Y) = -Σ_ij p(x_i, y_j) · ln(n_ij / n_j)`.
fn conditional_entropy<F>(labels_x: ArrayView1<i32>, labels_y: ArrayView1<i32>) -> Result<F>
where
    F: Float + FromPrimitive + Debug + 'static,
{
    let total = labels_x.len() as f64;
    let contingency = build_contingency_matrix(labels_x, labels_y)?;
    // Column marginals give the conditioning distribution over Y.
    let y_marginals = contingency.sum_axis(Axis(0));
    let mut result = F::zero();
    for (j, &col_total) in y_marginals.iter().enumerate() {
        let n_j = col_total as f64;
        // Empty columns of Y contribute nothing.
        if n_j <= 0.0 {
            continue;
        }
        for i in 0..contingency.shape()[0] {
            let n_ij = contingency[[i, j]] as f64;
            if n_ij > 0.0 {
                let term = n_ij / total * (n_ij / n_j).ln();
                result = result - F::from(term).expect("Failed to convert to float");
            }
        }
    }
    Ok(result)
}
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;
    // Two tight, well-separated 2-D clusters: the DB index must be finite
    // and non-negative (it is a ratio of non-negative quantities).
    #[test]
    fn test_davies_bouldin_score() {
        let data = Array2::from_shape_vec((4, 2), vec![0.0, 0.0, 0.1, 0.1, 5.0, 5.0, 5.1, 5.1])
            .expect("Operation failed");
        let labels = Array1::from_vec(vec![0, 0, 1, 1]);
        let score = davies_bouldin_score(data.view(), labels.view()).expect("Operation failed");
        assert!(score >= 0.0);
    }
    // Same well-separated layout: between-cluster variance dominates, so the
    // CH score must be strictly positive.
    #[test]
    fn test_calinski_harabasz_score() {
        let data = Array2::from_shape_vec((4, 2), vec![0.0, 0.0, 0.1, 0.1, 5.0, 5.0, 5.1, 5.1])
            .expect("Operation failed");
        let labels = Array1::from_vec(vec![0, 0, 1, 1]);
        let score = calinski_harabasz_score(data.view(), labels.view()).expect("Operation failed");
        assert!(score > 0.0);
    }
    // The predicted labels are a permutation of the true ones (1<->2 swapped);
    // ARI is label-permutation invariant, and always lies in [-1, 1].
    #[test]
    fn test_adjusted_rand_index() {
        let labels_true = Array1::from_vec(vec![0, 0, 1, 1, 2, 2]);
        let labels_pred = Array1::from_vec(vec![0, 0, 2, 2, 1, 1]);
        let ari: f64 =
            adjusted_rand_index(labels_true.view(), labels_pred.view()).expect("Operation failed");
        assert!(ari >= -1.0 && ari <= 1.0);
    }
    // Identical labelings must give NMI == 1 (up to float tolerance).
    #[test]
    fn test_normalized_mutual_info() {
        let labels_true = Array1::from_vec(vec![0, 0, 1, 1]);
        let labels_pred = Array1::from_vec(vec![0, 0, 1, 1]);
        let nmi: f64 = normalized_mutual_info(labels_true.view(), labels_pred.view(), "arithmetic")
            .expect("Operation failed");
        assert!((nmi - 1.0).abs() < 1e-6);
    }
    // The prediction merges two true classes; all three scores must still be
    // valid proportions in [0, 1].
    #[test]
    fn test_homogeneity_completeness_v_measure() {
        let labels_true = Array1::from_vec(vec![0, 0, 1, 1, 2, 2]);
        let labels_pred = Array1::from_vec(vec![0, 0, 1, 1, 1, 1]);
        let (h, c, v): (f64, f64, f64) =
            homogeneity_completeness_v_measure(labels_true.view(), labels_pred.view())
                .expect("Operation failed");
        assert!(h >= 0.0 && h <= 1.0);
        assert!(c >= 0.0 && c <= 1.0);
        assert!(v >= 0.0 && v <= 1.0);
    }
}