use scirs2_core::ndarray::{Array1, Array2, ArrayBase, Data, Dimension, Ix2};
use scirs2_core::numeric::{Float, NumCast};
use std::collections::HashMap;
use super::{calculate_distance, group_by_labels, pairwise_distances};
use crate::error::{MetricsError, Result};
#[derive(Debug, Clone)]
pub struct SilhouetteAnalysis<F: Float> {
pub sample_values: Vec<F>,
pub mean_score: F,
pub cluster_scores: HashMap<usize, F>,
pub sorted_indices: Vec<usize>,
pub cluster_mapping: HashMap<usize, usize>,
pub cluster_sizes: Vec<usize>,
}
#[allow(dead_code)]
pub fn silhouette_score<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
metric: &str,
) -> Result<F>
where
F: Float + NumCast + std::fmt::Debug + scirs2_core::ndarray::ScalarOperand + 'static,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let silhouette_results = silhouette_analysis(x, labels, metric)?;
Ok(silhouette_results.mean_score)
}
#[allow(dead_code)]
pub fn silhouette_samples<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
metric: &str,
) -> Result<Vec<F>>
where
F: Float + NumCast + std::fmt::Debug + scirs2_core::ndarray::ScalarOperand + 'static,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let analysis = silhouette_analysis(x, labels, metric)?;
Ok(analysis.sample_values)
}
#[allow(dead_code)]
pub fn silhouette_scores_per_cluster<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
metric: &str,
) -> Result<HashMap<usize, F>>
where
F: Float + NumCast + std::fmt::Debug + scirs2_core::ndarray::ScalarOperand + 'static,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let analysis = silhouette_analysis(x, labels, metric)?;
Ok(analysis.cluster_scores)
}
#[allow(dead_code)]
pub fn silhouette_analysis<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
metric: &str,
) -> Result<SilhouetteAnalysis<F>>
where
F: Float + NumCast + std::fmt::Debug + scirs2_core::ndarray::ScalarOperand + 'static,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
if metric != "euclidean" {
return Err(MetricsError::InvalidInput(format!(
"Unsupported metric: {metric}. Only 'euclidean' is currently supported."
)));
}
let n_samples = x.shape()[0];
if n_samples != labels.len() {
return Err(MetricsError::InvalidInput(format!(
"x has {} samples, but labels has {} samples",
n_samples,
labels.len()
)));
}
if n_samples < 2 {
return Err(MetricsError::InvalidInput(
"n_samples must be at least 2".to_string(),
));
}
let clusters = group_by_labels(x, labels)?;
if clusters.len() < 2 {
return Err(MetricsError::InvalidInput(
"Number of labels is 1. Silhouette analysis is undefined for a single cluster."
.to_string(),
));
}
let empty_clusters: Vec<_> = clusters
.iter()
.filter(|(_, samples)| samples.is_empty())
.map(|(&label, _)| label)
.collect();
if !empty_clusters.is_empty() {
return Err(MetricsError::InvalidInput(format!(
"Empty clusters found: {empty_clusters:?}"
)));
}
let distances = pairwise_distances(x, metric)?;
let mut silhouette_values = Vec::with_capacity(n_samples);
let mut sample_clusters = Vec::with_capacity(n_samples);
for i in 0..n_samples {
let label_i = labels.iter().nth(i).ok_or_else(|| {
MetricsError::InvalidInput(format!("Could not access index {i} in labels"))
})?;
let cluster_i = &clusters[label_i];
sample_clusters.push(*label_i);
let mut a = F::zero();
let mut count_a = 0;
for &j in cluster_i {
if i == j {
continue;
}
a = a + distances[[i, j]];
count_a += 1;
}
if count_a > 0 {
a = a / F::from(count_a).expect("Failed to convert to float");
}
let mut b = None;
for (label_j, cluster_j) in &clusters {
if *label_j == *label_i {
continue;
}
let mut cluster_dist = F::zero();
for &j in cluster_j {
cluster_dist = cluster_dist + distances[[i, j]];
}
let cluster_dist = cluster_dist / F::from(cluster_j.len()).expect("Operation failed");
if let Some(current_b) = b {
if cluster_dist < current_b {
b = Some(cluster_dist);
}
} else {
b = Some(cluster_dist);
}
}
let s = if let Some(b) = b {
if a < b {
F::one() - a / b
} else if a > b {
b / a - F::one()
} else {
F::zero()
}
} else {
F::zero() };
silhouette_values.push(s);
}
let sum = silhouette_values
.iter()
.fold(F::zero(), |acc, &val| acc + val);
let mean_score = sum / F::from(n_samples).expect("Failed to convert to float");
let mut cluster_scores = HashMap::new();
for (label, indices) in &clusters {
let mut cluster_sum = F::zero();
for &idx in indices {
cluster_sum = cluster_sum + silhouette_values[idx];
}
let cluster_mean = cluster_sum / F::from(indices.len()).expect("Operation failed");
cluster_scores.insert(*label, cluster_mean);
}
let unique_labels: Vec<_> = clusters.keys().cloned().collect();
let mut cluster_mapping = HashMap::new();
for (i, &label) in unique_labels.iter().enumerate() {
cluster_mapping.insert(label, i);
}
let mut cluster_sizes = vec![0; cluster_mapping.len()];
for (label, indices) in &clusters {
let mapped_idx = cluster_mapping[label];
cluster_sizes[mapped_idx] = indices.len();
}
let mut samples_with_scores: Vec<(usize, F, usize)> = silhouette_values
.iter()
.enumerate()
.map(|(i, &s)| (i, s, cluster_mapping[&sample_clusters[i]]))
.collect();
samples_with_scores.sort_by(|a, b| {
a.2.cmp(&b.2)
.then(b.1.partial_cmp(&a.1).expect("Operation failed"))
});
let sorted_indices = samples_with_scores.iter().map(|&(i, _, _)| i).collect();
Ok(SilhouetteAnalysis {
sample_values: silhouette_values,
mean_score,
cluster_scores,
sorted_indices,
cluster_mapping,
cluster_sizes,
})
}
#[allow(dead_code)]
pub fn davies_bouldin_score<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
) -> Result<F>
where
F: Float + NumCast + std::fmt::Debug + scirs2_core::ndarray::ScalarOperand + 'static,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let n_samples = x.shape()[0];
if n_samples != labels.len() {
return Err(MetricsError::InvalidInput(format!(
"x has {} samples, but labels has {} samples",
n_samples,
labels.len()
)));
}
if n_samples < 2 {
return Err(MetricsError::InvalidInput(
"n_samples must be at least 2".to_string(),
));
}
let clusters = group_by_labels(x, labels)?;
if clusters.len() < 2 {
return Err(MetricsError::InvalidInput(
"Number of labels is 1. Davies-Bouldin index is undefined for a single cluster."
.to_string(),
));
}
let mut centroids = HashMap::new();
for (&label, indices) in &clusters {
let mut centroid = Array1::<F>::zeros(x.shape()[1]);
for &idx in indices {
centroid = centroid + x.row(idx).to_owned();
}
centroid = centroid / F::from(indices.len()).expect("Operation failed");
centroids.insert(label, centroid);
}
let mut avg_distances = HashMap::new();
for (&label, indices) in &clusters {
let centroid = centroids.get(&label).expect("Operation failed");
let mut total_distance = F::zero();
for &idx in indices {
total_distance = total_distance
+ calculate_distance(&x.row(idx).to_vec(), ¢roid.to_vec(), "euclidean")?;
}
let avg_distance = total_distance / F::from(indices.len()).expect("Operation failed");
avg_distances.insert(label, avg_distance);
}
let mut db_index = F::zero();
let labels_vec: Vec<_> = clusters.keys().cloned().collect();
for i in 0..labels_vec.len() {
let label_i = labels_vec[i];
let centroid_i = centroids.get(&label_i).expect("Operation failed");
let avg_dist_i = avg_distances.get(&label_i).expect("Operation failed");
let mut max_ratio = F::zero();
for (j, &label_j) in labels_vec.iter().enumerate() {
if i == j {
continue;
}
let centroid_j = centroids.get(&label_j).expect("Operation failed");
let avg_dist_j = avg_distances.get(&label_j).expect("Operation failed");
let centroid_dist =
calculate_distance(¢roid_i.to_vec(), ¢roid_j.to_vec(), "euclidean")?;
let ratio = (*avg_dist_i + *avg_dist_j) / centroid_dist;
if ratio > max_ratio {
max_ratio = ratio;
}
}
db_index = db_index + max_ratio;
}
Ok(db_index / F::from(labels_vec.len()).expect("Operation failed"))
}
#[allow(dead_code)]
pub fn calinski_harabasz_score<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
) -> Result<F>
where
F: Float + NumCast + std::fmt::Debug + scirs2_core::ndarray::ScalarOperand + 'static,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let n_samples = x.shape()[0];
if n_samples != labels.len() {
return Err(MetricsError::InvalidInput(format!(
"x has {} samples, but labels has {} samples",
n_samples,
labels.len()
)));
}
if n_samples < 2 {
return Err(MetricsError::InvalidInput(
"n_samples must be at least 2".to_string(),
));
}
let clusters = group_by_labels(x, labels)?;
if clusters.len() < 2 {
return Err(MetricsError::InvalidInput(
"Number of labels is 1. Calinski-Harabasz index is undefined for a single cluster."
.to_string(),
));
}
let mut global_centroid = Array1::<F>::zeros(x.shape()[1]);
for i in 0..n_samples {
global_centroid = global_centroid + x.row(i).to_owned();
}
global_centroid = global_centroid / F::from(n_samples).expect("Failed to convert to float");
let mut centroids = HashMap::new();
for (&label, indices) in &clusters {
let mut centroid = Array1::<F>::zeros(x.shape()[1]);
for &idx in indices {
centroid = centroid + x.row(idx).to_owned();
}
centroid = centroid / F::from(indices.len()).expect("Operation failed");
centroids.insert(label, centroid);
}
let mut between_disp = F::zero();
for (label, indices) in &clusters {
let cluster_size = F::from(indices.len()).expect("Operation failed");
let centroid = centroids.get(label).expect("Operation failed");
let mut squared_dist = F::zero();
for (c, g) in centroid.iter().zip(global_centroid.iter()) {
let diff = *c - *g;
squared_dist = squared_dist + diff * diff;
}
between_disp = between_disp + cluster_size * squared_dist;
}
let mut within_disp = F::zero();
for (label, indices) in &clusters {
let centroid = centroids.get(label).expect("Operation failed");
let mut cluster_disp = F::zero();
for &idx in indices {
let mut squared_dist = F::zero();
for (x_val, c_val) in x.row(idx).iter().zip(centroid.iter()) {
let diff = *x_val - *c_val;
squared_dist = squared_dist + diff * diff;
}
cluster_disp = cluster_disp + squared_dist;
}
within_disp = within_disp + cluster_disp;
}
if within_disp <= F::epsilon() {
return Err(MetricsError::CalculationError(
"Within-cluster dispersion is zero".to_string(),
));
}
let n_clusters = F::from(clusters.len()).expect("Operation failed");
Ok(
between_disp * (F::from(n_samples - clusters.len()).expect("Operation failed"))
/ (within_disp * (n_clusters - F::one())),
)
}
#[allow(dead_code)]
pub fn dunn_index<F, S1, S2, D>(x: &ArrayBase<S1, Ix2>, labels: &ArrayBase<S2, D>) -> Result<F>
where
F: Float + NumCast + std::fmt::Debug + 'static,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let n_samples = x.shape()[0];
if n_samples != labels.len() {
return Err(MetricsError::InvalidInput(format!(
"x has {} samples, but labels has {} samples",
n_samples,
labels.len()
)));
}
if n_samples < 2 {
return Err(MetricsError::InvalidInput(
"n_samples must be at least 2".to_string(),
));
}
let clusters = group_by_labels(x, labels)?;
if clusters.len() < 2 {
return Err(MetricsError::InvalidInput(
"Number of labels is 1. Dunn index is undefined for a single cluster.".to_string(),
));
}
let distances = pairwise_distances(x, "euclidean")?;
let mut max_intra_cluster_distance = F::zero();
for indices in clusters.values() {
let mut max_distance = F::zero();
for (i, &idx1) in indices.iter().enumerate() {
for &idx2 in &indices[i + 1..] {
let dist = distances[[idx1, idx2]];
if dist > max_distance {
max_distance = dist;
}
}
}
if max_distance > max_intra_cluster_distance {
max_intra_cluster_distance = max_distance;
}
}
if max_intra_cluster_distance <= F::epsilon() {
return Err(MetricsError::CalculationError(
"Maximum intra-cluster distance is zero".to_string(),
));
}
let mut min_inter_cluster_distance = F::infinity();
let cluster_labels: Vec<_> = clusters.keys().collect();
for i in 0..cluster_labels.len() {
for j in i + 1..cluster_labels.len() {
let cluster_i = &clusters[cluster_labels[i]];
let cluster_j = &clusters[cluster_labels[j]];
let mut min_distance = F::infinity();
for &idx1 in cluster_i {
for &idx2 in cluster_j {
let dist = distances[[idx1, idx2]];
if dist < min_distance {
min_distance = dist;
}
}
}
if min_distance < min_inter_cluster_distance {
min_inter_cluster_distance = min_distance;
}
}
}
Ok(min_inter_cluster_distance / max_intra_cluster_distance)
}