use scirs2_core::ndarray::{Array1, Array2, ArrayBase, Data, Dimension, Ix2};
use scirs2_core::numeric::{Float, NumCast};
use scirs2_core::random::{rngs::StdRng, Rng, SeedableRng};
use std::collections::HashMap;
use std::ops::{AddAssign, DivAssign};
use crate::clustering::adjusted_rand_index;
use crate::error::{MetricsError, Result};
#[allow(dead_code)]
pub fn jaccard_similarity<S1, S2, D1, D2>(
labels_true: &ArrayBase<S1, D1>,
labels_pred: &ArrayBase<S2, D2>,
) -> Result<f64>
where
S1: Data<Elem = usize>,
S2: Data<Elem = usize>,
D1: Dimension,
D2: Dimension,
{
if labels_true.len() != labels_pred.len() {
return Err(MetricsError::InvalidInput(format!(
"labels_true has {} samples, but labels_pred has {} samples",
labels_true.len(),
labels_pred.len()
)));
}
let n_samples = labels_true.len();
if n_samples <= 1 {
return Err(MetricsError::InvalidInput(
"Number of samples must be greater than 1".to_string(),
));
}
let mut same_true = 0;
let mut same_pred = 0;
let mut same_both = 0;
for i in 0..n_samples {
for j in (i + 1)..n_samples {
let true_i = labels_true.iter().nth(i).expect("Operation failed");
let true_j = labels_true.iter().nth(j).expect("Operation failed");
let pred_i = labels_pred.iter().nth(i).expect("Operation failed");
let pred_j = labels_pred.iter().nth(j).expect("Operation failed");
let same_in_true = true_i == true_j;
let same_in_pred = pred_i == pred_j;
if same_in_true {
same_true += 1;
}
if same_in_pred {
same_pred += 1;
}
if same_in_true && same_in_pred {
same_both += 1;
}
}
}
let union_size = same_true + same_pred - same_both;
let jaccard = if union_size > 0 {
same_both as f64 / union_size as f64
} else {
1.0 };
Ok(jaccard)
}
#[allow(dead_code)]
pub fn cluster_stability<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
n_runs: Option<usize>,
perturbation_scale: Option<F>,
random_seed: Option<u64>,
) -> Result<F>
where
F: Float
+ NumCast
+ std::fmt::Debug
+ scirs2_core::ndarray::ScalarOperand
+ AddAssign
+ DivAssign,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let n_samples = x.shape()[0];
if n_samples != labels.len() {
return Err(MetricsError::InvalidInput(format!(
"x has {} samples, but labels has {} samples",
n_samples,
labels.len()
)));
}
let n_runs = n_runs.unwrap_or(10);
let perturbation_scale = perturbation_scale
.unwrap_or_else(|| F::from(0.1).expect("Failed to convert constant to float"));
if n_runs < 2 {
return Err(MetricsError::InvalidInput(
"Number of _runs must be at least 2".to_string(),
));
}
let _seed = random_seed.unwrap_or_else(scirs2_core::random::random::<u64>);
let mut rng = StdRng::seed_from_u64(_seed);
let mut unique_labels = Vec::new();
for &label in labels.iter() {
if !unique_labels.contains(&label) {
unique_labels.push(label);
}
}
let mut label_indices = HashMap::new();
for (i, &label) in unique_labels.iter().enumerate() {
label_indices.insert(label, i);
}
let mut original_labels = Array1::zeros(n_samples);
for (i, &label) in labels.iter().enumerate() {
original_labels[i] = *label_indices.get(&label).expect("Operation failed");
}
let mut stability_scores = Vec::new();
for _ in 0..n_runs {
let mut perturbed_data = Array2::zeros((n_samples, x.ncols()));
for i in 0..n_samples {
for j in 0..x.ncols() {
let noise: f64 = rng.random_range(-1.0..1.0);
let noise_value =
F::from(noise).expect("Failed to convert to float") * perturbation_scale;
perturbed_data[[i, j]] = x[[i, j]] + noise_value;
}
}
let mut centroids = Array2::zeros((unique_labels.len(), x.ncols()));
let mut counts = vec![0; unique_labels.len()];
for i in 0..n_samples {
let label_idx = original_labels[i];
for j in 0..x.ncols() {
centroids[[label_idx, j]] += x[[i, j]];
}
counts[label_idx] += 1;
}
for i in 0..unique_labels.len() {
if counts[i] > 0 {
for j in 0..x.ncols() {
centroids[[i, j]] /= F::from(counts[i]).expect("Failed to convert to float");
}
}
}
let mut perturbed_labels = Array1::zeros(n_samples);
for i in 0..n_samples {
let mut min_dist = F::infinity();
let mut best_label = 0;
for (label_idx, _) in unique_labels.iter().enumerate() {
let mut dist = F::zero();
for j in 0..x.ncols() {
let diff = perturbed_data[[i, j]] - centroids[[label_idx, j]];
dist += diff * diff;
}
if dist < min_dist {
min_dist = dist;
best_label = label_idx;
}
}
perturbed_labels[i] = best_label;
}
let mut ari_input_true = Vec::new();
let mut ari_input_pred = Vec::new();
for i in 0..n_samples {
ari_input_true.push(original_labels[i]);
ari_input_pred.push(perturbed_labels[i]);
}
let ari_true = scirs2_core::ndarray::Array1::from_vec(ari_input_true);
let ari_pred = scirs2_core::ndarray::Array1::from_vec(ari_input_pred);
let ari = adjusted_rand_index(&ari_true, &ari_pred).expect("Operation failed");
stability_scores.push(F::from(ari).expect("Failed to convert to float"));
}
let sum = stability_scores.iter().fold(F::zero(), |acc, &x| acc + x);
let mean = sum / F::from(stability_scores.len()).expect("Operation failed");
Ok(mean)
}
#[allow(dead_code)]
pub fn consensus_score<S, D>(alllabels: &[&ArrayBase<S, D>]) -> Result<f64>
where
S: Data<Elem = usize>,
D: Dimension,
{
if alllabels.is_empty() {
return Err(MetricsError::InvalidInput(
"At least one clustering result is required".to_string(),
));
}
let n_clusterings = alllabels.len();
if n_clusterings < 2 {
return Err(MetricsError::InvalidInput(
"At least two clusterings are required for consensus score".to_string(),
));
}
let n_samples = alllabels[0].len();
for labels in alllabels.iter().skip(1) {
if labels.len() != n_samples {
return Err(MetricsError::InvalidInput(
"All clusterings must have the same number of samples".to_string(),
));
}
}
if n_samples <= 1 {
return Err(MetricsError::InvalidInput(
"Number of samples must be greater than 1".to_string(),
));
}
let mut consensus_values = vec![vec![0.0; n_samples]; n_samples];
for labels in alllabels {
for i in 0..n_samples {
for j in i..n_samples {
let label_i = labels.iter().nth(i).expect("Operation failed");
let label_j = labels.iter().nth(j).expect("Operation failed");
if label_i == label_j {
consensus_values[i][j] += 1.0;
if i != j {
consensus_values[j][i] += 1.0; }
}
}
}
}
for i in 0..n_samples {
for j in 0..n_samples {
consensus_values[i][j] /= n_clusterings as f64;
}
}
let mut total = 0.0;
let mut count = 0;
for i in 0..n_samples {
for j in (i + 1)..n_samples {
total += consensus_values[i][j];
count += 1;
}
}
let consensus = if count > 0 {
total / count as f64
} else {
1.0 };
Ok(consensus)
}
#[allow(dead_code)]
pub fn fold_stability<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
n_folds: Option<usize>,
fold_size: Option<f64>,
random_seed: Option<u64>,
) -> Result<F>
where
F: Float
+ NumCast
+ std::fmt::Debug
+ scirs2_core::ndarray::ScalarOperand
+ AddAssign
+ DivAssign,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let n_samples = x.shape()[0];
if n_samples != labels.len() {
return Err(MetricsError::InvalidInput(format!(
"x has {} samples, but labels has {} samples",
n_samples,
labels.len()
)));
}
let n_folds = n_folds.unwrap_or(5);
let fold_size = fold_size.unwrap_or(0.8);
if n_folds < 2 {
return Err(MetricsError::InvalidInput(
"Number of _folds must be at least 2".to_string(),
));
}
if fold_size <= 0.0 || fold_size >= 1.0 {
return Err(MetricsError::InvalidInput(
"Fold _size must be between 0 and 1 (exclusive)".to_string(),
));
}
let _seed = random_seed.unwrap_or_else(scirs2_core::random::random::<u64>);
let mut rng = StdRng::seed_from_u64(_seed);
let mut unique_labels = Vec::new();
for &label in labels.iter() {
if !unique_labels.contains(&label) {
unique_labels.push(label);
}
}
let fold_sample_count = (n_samples as f64 * fold_size) as usize;
let mut fold_indices = Vec::new();
for _ in 0..n_folds {
let mut indices = Vec::new();
let mut available_indices: Vec<usize> = (0..n_samples).collect();
for i in (1..available_indices.len()).rev() {
let j = rng.random_range(0..=i);
available_indices.swap(i, j);
}
for i in 0..fold_sample_count.min(available_indices.len()) {
indices.push(available_indices[i]);
}
fold_indices.push(indices);
}
let mut centroids = HashMap::new();
let mut counts = HashMap::new();
for (i, &label) in labels.iter().enumerate() {
if !centroids.contains_key(&label) {
let centroid = Array1::zeros(x.ncols());
centroids.insert(label, centroid);
counts.insert(label, 0);
}
let centroid = centroids.get_mut(&label).expect("Operation failed");
for j in 0..x.ncols() {
centroid[j] += x[[i, j]];
}
*counts.get_mut(&label).expect("Operation failed") += 1;
}
for (&label, centroid) in centroids.iter_mut() {
let count = *counts.get(&label).expect("Operation failed");
if count > 0 {
for j in 0..centroid.len() {
centroid[j] /= F::from(count).expect("Failed to convert to float");
}
}
}
let mut stability_scores = Vec::new();
for fold_idx in &fold_indices {
let fold_size = fold_idx.len();
let mut fold_data = Array2::zeros((fold_size, x.ncols()));
let mut fold_labels = Vec::new();
for (i, &idx) in fold_idx.iter().enumerate() {
for j in 0..x.ncols() {
fold_data[[i, j]] = x[[idx, j]];
}
fold_labels.push(*labels.iter().nth(idx).expect("Operation failed"));
}
let mut predicted_labels = Vec::new();
for i in 0..fold_size {
let mut min_dist = F::infinity();
let mut best_label = 0;
for &label in &unique_labels {
let centroid = centroids.get(&label).expect("Operation failed");
let mut dist = F::zero();
for j in 0..x.ncols() {
let diff = fold_data[[i, j]] - centroid[j];
dist += diff * diff;
}
if dist < min_dist {
min_dist = dist;
best_label = label;
}
}
predicted_labels.push(best_label);
}
let true_labels = scirs2_core::ndarray::Array1::from_vec(fold_labels);
let pred_labels = scirs2_core::ndarray::Array1::from_vec(predicted_labels);
let jaccard = jaccard_similarity(&true_labels, &pred_labels).expect("Operation failed");
stability_scores.push(F::from(jaccard).expect("Failed to convert to float"));
}
let sum = stability_scores.iter().fold(F::zero(), |acc, &x| acc + x);
let mean = sum / F::from(stability_scores.len()).expect("Operation failed");
Ok(mean)
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_abs_diff_eq;
use scirs2_core::ndarray::array;
#[test]
fn test_jaccard_similarity() {
let labels1 = array![0, 0, 1, 1, 2, 2];
let labels2 = array![0, 0, 1, 1, 2, 2];
let similarity = jaccard_similarity(&labels1, &labels2).expect("Operation failed");
assert_abs_diff_eq!(similarity, 1.0, epsilon = 1e-10);
let labels3 = array![0, 0, 1, 1, 2, 2];
let labels4 = array![1, 1, 0, 0, 2, 2];
let similarity = jaccard_similarity(&labels3, &labels4).expect("Operation failed");
assert!(similarity > 0.5);
let labels5 = array![0, 0, 0, 1, 1, 1];
let labels6 = array![0, 0, 1, 1, 2, 2];
let similarity = jaccard_similarity(&labels5, &labels6).expect("Operation failed");
assert!(similarity < 1.0);
let labels7 = array![0, 0, 0, 0, 0, 0];
let labels8 = array![0, 1, 2, 3, 4, 5];
let similarity = jaccard_similarity(&labels7, &labels8).expect("Operation failed");
assert!(similarity < 0.5); }
#[test]
fn test_consensus_score() {
let clustering1 = array![0, 0, 0, 1, 1, 1];
let clustering2 = array![0, 0, 0, 1, 1, 1];
let all_clusterings = vec![&clustering1, &clustering2];
let score = consensus_score(&all_clusterings).expect("Operation failed");
assert!(score > 0.0);
let clustering3 = array![0, 0, 0, 1, 1, 1];
let clustering4 = array![1, 1, 1, 0, 0, 0];
let all_clusterings = vec![&clustering3, &clustering4];
let score = consensus_score(&all_clusterings).expect("Operation failed");
assert!(score > 0.0);
let clustering5 = array![0, 0, 0, 1, 1, 1];
let clustering6 = array![1, 1, 1, 0, 0, 0]; let clustering7 = array![0, 0, 1, 1, 2, 2];
let all_clusterings = vec![&clustering5, &clustering6, &clustering7];
let score = consensus_score(&all_clusterings).expect("Operation failed");
assert!(score > 0.0);
assert!(score < 1.0);
}
#[test]
fn test_cluster_stability() {
let x = Array2::from_shape_vec(
(6, 2),
vec![1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 5.0, 6.0, 5.2, 5.8, 5.5, 6.2],
)
.expect("Operation failed");
let labels = array![0, 0, 0, 1, 1, 1];
let stability =
cluster_stability(&x, &labels, Some(3), None, Some(42)).expect("Operation failed");
assert!(stability > 0.5);
let less_separated = Array2::from_shape_vec(
(6, 2),
vec![1.0, 2.0, 1.5, 1.8, 2.8, 3.0, 3.2, 3.5, 4.0, 4.2, 4.5, 5.0],
)
.expect("Operation failed");
let stability_less = cluster_stability(&less_separated, &labels, Some(3), None, Some(42))
.expect("Operation failed");
assert!(stability_less >= 0.0);
assert!(stability_less <= 1.0);
}
#[test]
fn test_fold_stability() {
let mut x_data = Vec::new();
let mut labels_data = Vec::new();
for i in 0..20 {
x_data.push(1.0 + (i as f64 % 5.0) * 0.1); x_data.push(2.0 + (i as f64 % 4.0) * 0.1); labels_data.push(0);
}
for i in 0..20 {
x_data.push(5.0 + (i as f64 % 5.0) * 0.1); x_data.push(6.0 + (i as f64 % 4.0) * 0.1); labels_data.push(1);
}
let x = Array2::from_shape_vec((40, 2), x_data).expect("Operation failed");
let labels = Array1::from_vec(labels_data);
let stability =
fold_stability(&x, &labels, Some(3), Some(0.7), Some(42)).expect("Operation failed");
assert!(stability > 0.7);
let invalid_folds = fold_stability(&x, &labels, Some(1), None, None);
assert!(invalid_folds.is_err());
let invalid_fold_size = fold_stability(&x, &labels, None, Some(1.2), None);
assert!(invalid_fold_size.is_err());
}
#[test]
fn test_with_invalid_inputs() {
let labels1 = array![0, 0, 1, 1, 2, 2];
let labels2 = array![0, 0, 1, 1];
let result = jaccard_similarity(&labels1, &labels2);
assert!(result.is_err());
let empty = array![] as scirs2_core::ndarray::Array1<usize>;
let singleton = array![0];
let result = jaccard_similarity(&empty, &empty);
assert!(result.is_err());
let result = jaccard_similarity(&singleton, &singleton);
assert!(result.is_err());
let empty_array: scirs2_core::ndarray::Array1<usize> =
scirs2_core::ndarray::Array1::zeros(0);
let result = consensus_score(&[&empty_array]);
assert!(result.is_err());
let result = consensus_score(&[&labels1]);
assert!(result.is_err());
let result = consensus_score(&[&labels1, &labels2]);
assert!(result.is_err());
}
}