use scirs2_core::ndarray::{Array1, ArrayBase, Data, Dimension, Ix1, Ix2};
use scirs2_core::numeric::{Float, NumCast};
use scirs2_core::simd_ops::SimdUnifiedOps;
use std::collections::{HashMap, HashSet};
use std::ops::{AddAssign, DivAssign};
use crate::error::{MetricsError, Result};
#[allow(dead_code)]
pub fn inter_cluster_distances<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
metric: &str,
) -> Result<HashMap<(usize, usize), F>>
where
F: Float
+ NumCast
+ std::fmt::Debug
+ scirs2_core::ndarray::ScalarOperand
+ AddAssign
+ DivAssign
+ SimdUnifiedOps,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
if !["euclidean", "manhattan", "cosine"].contains(&metric) {
return Err(MetricsError::InvalidInput(format!(
"Unsupported metric: {metric}. Supported metrics are 'euclidean', 'manhattan', and 'cosine'."
)));
}
let n_samples = x.shape()[0];
if n_samples != labels.len() {
return Err(MetricsError::InvalidInput(format!(
"x has {} samples, but labels has {} samples",
n_samples,
labels.len()
)));
}
let unique_set: HashSet<usize> = labels.iter().copied().collect();
let mut unique_labels: Vec<usize> = unique_set.into_iter().collect();
unique_labels.sort();
let n_features = x.shape()[1];
let mut centroids = HashMap::new();
for &label in &unique_labels {
let mut centroid = Array1::zeros(n_features);
let mut count = 0;
for (i, &sample_label) in labels.iter().enumerate() {
if sample_label == label {
let sample = x.slice(scirs2_core::ndarray::s![i, ..]);
centroid += &sample;
count += 1;
}
}
if count > 0 {
centroid /= F::from(count).expect("Failed to convert to float");
centroids.insert(label, centroid);
}
}
let mut distances = HashMap::new();
for (i, &label_i) in unique_labels.iter().enumerate() {
for &label_j in unique_labels.iter().skip(i + 1) {
let centroid_i = centroids.get(&label_i).expect("Operation failed");
let centroid_j = centroids.get(&label_j).expect("Operation failed");
let distance = match metric {
"euclidean" => euclidean_distance(centroid_i, centroid_j),
"manhattan" => manhattan_distance(centroid_i, centroid_j),
"cosine" => cosine_distance(centroid_i, centroid_j),
_ => {
return Err(MetricsError::InvalidInput(format!(
"Unsupported metric: {metric}"
)))
}
};
distances.insert((label_i, label_j), distance);
distances.insert((label_j, label_i), distance); }
}
Ok(distances)
}
#[allow(dead_code)]
pub fn intra_cluster_distances<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
metric: &str,
) -> Result<HashMap<usize, F>>
where
F: Float
+ NumCast
+ std::fmt::Debug
+ scirs2_core::ndarray::ScalarOperand
+ AddAssign
+ DivAssign
+ SimdUnifiedOps,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
if !["euclidean", "manhattan", "cosine"].contains(&metric) {
return Err(MetricsError::InvalidInput(format!(
"Unsupported metric: {metric}. Supported metrics are 'euclidean', 'manhattan', and 'cosine'."
)));
}
let n_samples = x.shape()[0];
if n_samples != labels.len() {
return Err(MetricsError::InvalidInput(format!(
"x has {} samples, but labels has {} samples",
n_samples,
labels.len()
)));
}
let unique_set: HashSet<usize> = labels.iter().copied().collect();
let mut unique_labels: Vec<usize> = unique_set.into_iter().collect();
unique_labels.sort();
let n_features = x.shape()[1];
let mut centroids = HashMap::new();
for &label in &unique_labels {
let mut centroid = Array1::zeros(n_features);
let mut count = 0;
for (i, &sample_label) in labels.iter().enumerate() {
if sample_label == label {
let sample = x.slice(scirs2_core::ndarray::s![i, ..]);
centroid += &sample;
count += 1;
}
}
if count > 0 {
centroid /= F::from(count).expect("Failed to convert to float");
centroids.insert(label, centroid);
}
}
let mut distances = HashMap::new();
for &label in &unique_labels {
let centroid = centroids.get(&label).expect("Operation failed");
let mut total_distance = F::zero();
let mut count = 0;
for (i, &sample_label) in labels.iter().enumerate() {
if sample_label == label {
let sample = x.slice(scirs2_core::ndarray::s![i, ..]);
let distance = match metric {
"euclidean" => euclidean_distance(&sample, centroid),
"manhattan" => manhattan_distance(&sample, centroid),
"cosine" => cosine_distance(&sample, centroid),
_ => {
return Err(MetricsError::InvalidInput(format!(
"Unsupported metric: {metric}"
)))
}
};
total_distance += distance;
count += 1;
}
}
if count > 0 {
let avg_distance = total_distance / F::from(count).expect("Failed to convert to float");
distances.insert(label, avg_distance);
}
}
Ok(distances)
}
#[allow(dead_code)]
pub fn distance_ratio_index<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
metric: &str,
) -> Result<F>
where
F: Float
+ NumCast
+ std::fmt::Debug
+ scirs2_core::ndarray::ScalarOperand
+ AddAssign
+ DivAssign
+ SimdUnifiedOps,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let inter_distances = inter_cluster_distances(x, labels, metric)?;
let intra_distances = intra_cluster_distances(x, labels, metric)?;
let unique_set: HashSet<usize> = labels.iter().copied().collect();
let mut unique_labels: Vec<usize> = unique_set.into_iter().collect();
unique_labels.sort();
if unique_labels.len() <= 1 {
return Ok(F::infinity());
}
let mut cluster_ratios = Vec::new();
for (i, &label_i) in unique_labels.iter().enumerate() {
let mut max_ratio = F::zero();
for &label_j in unique_labels.iter().skip(i + 1) {
if label_i == label_j {
continue;
}
let intra_i = *intra_distances.get(&label_i).unwrap_or(&F::zero());
let intra_j = *intra_distances.get(&label_j).unwrap_or(&F::zero());
let inter_ij = *inter_distances
.get(&(label_i, label_j))
.unwrap_or(&F::infinity());
let ratio = (intra_i + intra_j) / inter_ij;
if ratio > max_ratio {
max_ratio = ratio;
}
}
cluster_ratios.push(max_ratio);
}
let sum_ratios = cluster_ratios.iter().fold(F::zero(), |acc, &x| acc + x);
let avg_ratio = sum_ratios / F::from(cluster_ratios.len()).expect("Operation failed");
Ok(avg_ratio)
}
#[allow(dead_code)]
pub fn isolation_index<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
metric: &str,
) -> Result<F>
where
F: Float
+ NumCast
+ std::fmt::Debug
+ scirs2_core::ndarray::ScalarOperand
+ AddAssign
+ DivAssign
+ SimdUnifiedOps,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let inter_distances = inter_cluster_distances(x, labels, metric)?;
let intra_distances = intra_cluster_distances(x, labels, metric)?;
let unique_set: HashSet<usize> = labels.iter().copied().collect();
let mut unique_labels: Vec<usize> = unique_set.into_iter().collect();
unique_labels.sort();
if unique_labels.len() <= 1 {
return Ok(F::zero());
}
let min_inter =
inter_distances.values().fold(
F::infinity(),
|min_dist, &dist| {
if dist < min_dist {
dist
} else {
min_dist
}
},
);
let max_intra =
intra_distances.values().fold(
F::zero(),
|max_dist, &dist| {
if dist > max_dist {
dist
} else {
max_dist
}
},
);
let isolation = if max_intra > F::zero() {
min_inter / max_intra
} else {
F::infinity() };
Ok(isolation)
}
/// Returns the Euclidean (L2) distance between two one-dimensional arrays.
///
/// When both inputs report standard layout the computation goes through the
/// `simd_sub` / `simd_mul` / `simd_sum` operations of `F`; otherwise the
/// squared differences are summed element by element over the zipped inputs.
#[allow(dead_code)]
fn euclidean_distance<F, S1, S2>(x: &ArrayBase<S1, Ix1>, y: &ArrayBase<S2, Ix1>) -> F
where
    F: Float + scirs2_core::ndarray::ScalarOperand + SimdUnifiedOps,
    S1: Data<Elem = F>,
    S2: Data<Elem = F>,
{
    let both_contiguous = x.is_standard_layout() && y.is_standard_layout();
    if !both_contiguous {
        // Scalar fallback for strided inputs.
        return x
            .iter()
            .zip(y.iter())
            .fold(F::zero(), |acc, (&a, &b)| {
                let delta = a - b;
                acc + delta * delta
            })
            .sqrt();
    }

    let delta = F::simd_sub(&x.view(), &y.view());
    let delta_sq = F::simd_mul(&delta.view(), &delta.view());
    F::simd_sum(&delta_sq.view()).sqrt()
}
/// Returns the Manhattan (L1) distance between two one-dimensional arrays.
///
/// When both inputs report standard layout the computation goes through the
/// `simd_sub` / `simd_abs` / `simd_sum` operations of `F`; otherwise the
/// absolute differences are summed element by element over the zipped inputs.
#[allow(dead_code)]
fn manhattan_distance<F, S1, S2>(x: &ArrayBase<S1, Ix1>, y: &ArrayBase<S2, Ix1>) -> F
where
    F: Float + scirs2_core::ndarray::ScalarOperand + SimdUnifiedOps,
    S1: Data<Elem = F>,
    S2: Data<Elem = F>,
{
    let both_contiguous = x.is_standard_layout() && y.is_standard_layout();
    if !both_contiguous {
        // Scalar fallback for strided inputs.
        return x
            .iter()
            .zip(y.iter())
            .fold(F::zero(), |acc, (&a, &b)| {
                let delta = a - b;
                acc + delta.abs()
            });
    }

    let delta = F::simd_sub(&x.view(), &y.view());
    let magnitude = F::simd_abs(&delta.view());
    F::simd_sum(&magnitude.view())
}
/// Returns the cosine distance `1 - (x · y) / (‖x‖ ‖y‖)` between two
/// one-dimensional arrays.
///
/// If either norm is not greater than zero the distance is defined as one.
/// When both inputs report standard layout the dot product and squared norms
/// are computed with the `simd_mul` / `simd_sum` operations of `F`; otherwise
/// they are accumulated element by element over the zipped inputs.
#[allow(dead_code)]
fn cosine_distance<F, S1, S2>(x: &ArrayBase<S1, Ix1>, y: &ArrayBase<S2, Ix1>) -> F
where
    F: Float + scirs2_core::ndarray::ScalarOperand + SimdUnifiedOps,
    S1: Data<Elem = F>,
    S2: Data<Elem = F>,
{
    // Gather x·y, x·x and y·y; the square roots are taken afterwards.
    let (dot, x_sq_sum, y_sq_sum) = if x.is_standard_layout() && y.is_standard_layout() {
        let cross = F::simd_mul(&x.view(), &y.view());
        let dot = F::simd_sum(&cross.view());
        let x_sq = F::simd_mul(&x.view(), &x.view());
        let x_sq_sum = F::simd_sum(&x_sq.view());
        let y_sq = F::simd_mul(&y.view(), &y.view());
        let y_sq_sum = F::simd_sum(&y_sq.view());
        (dot, x_sq_sum, y_sq_sum)
    } else {
        x.iter().zip(y.iter()).fold(
            (F::zero(), F::zero(), F::zero()),
            |(dot, xx, yy), (&a, &b)| (dot + (a * b), xx + (a * a), yy + (b * b)),
        )
    };

    let x_norm = x_sq_sum.sqrt();
    let y_norm = y_sq_sum.sqrt();

    // A vector without positive length has no direction to compare against.
    if x_norm > F::zero() && y_norm > F::zero() {
        return F::one() - (dot / (x_norm * y_norm));
    }
    F::one()
}
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use scirs2_core::ndarray::{array, Array2};

    /// Builds the fixture shared by all tests: six 2-D points, the first three
    /// labelled `0` and the last three labelled `1`.
    fn two_cluster_fixture() -> (Array2<f64>, Array1<usize>) {
        let points = Array2::from_shape_vec(
            (6, 2),
            vec![1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 5.0, 6.0, 5.2, 5.8, 5.5, 6.2],
        )
        .expect("Operation failed");
        let assignment = array![0, 0, 0, 1, 1, 1];
        (points, assignment)
    }

    /// The centroid distance is large and stored under both key orderings.
    #[test]
    fn test_inter_cluster_distances_euclidean() {
        let (x, labels) = two_cluster_fixture();
        let distances =
            inter_cluster_distances(&x, &labels, "euclidean").expect("Operation failed");

        let forward = distances.get(&(0, 1)).expect("Operation failed");
        assert!(*forward > 4.0);

        let backward = distances.get(&(1, 0)).expect("Operation failed");
        assert_abs_diff_eq!(*forward, *backward, epsilon = 1e-10);
    }

    /// Both clusters have a small average distance to their centroid.
    #[test]
    fn test_intra_cluster_distances_euclidean() {
        let (x, labels) = two_cluster_fixture();
        let distances =
            intra_cluster_distances(&x, &labels, "euclidean").expect("Operation failed");

        let spread_first = distances.get(&0).expect("Operation failed");
        let spread_second = distances.get(&1).expect("Operation failed");
        assert!(*spread_first < 1.0);
        assert!(*spread_second < 1.0);
    }

    /// The ratio index stays below 0.5 on this fixture.
    #[test]
    fn test_distance_ratio_index() {
        let (x, labels) = two_cluster_fixture();
        let index = distance_ratio_index(&x, &labels, "euclidean").expect("Operation failed");
        assert!(index < 0.5);
    }

    /// The isolation index exceeds 2.0 on this fixture.
    #[test]
    fn test_isolation_index() {
        let (x, labels) = two_cluster_fixture();
        let index = isolation_index(&x, &labels, "euclidean").expect("Operation failed");
        assert!(index > 2.0);
    }
}