use scirs2_core::ndarray::{ArrayBase, Data, Dimension, Ix1, Ix2};
use scirs2_core::numeric::{Float, NumCast};
use std::collections::HashMap;
use std::ops::{AddAssign, DivAssign};
use crate::error::{MetricsError, Result};
#[allow(dead_code)]
pub fn local_density_factor<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
k: Option<usize>,
) -> Result<HashMap<usize, F>>
where
F: Float
+ NumCast
+ std::fmt::Debug
+ scirs2_core::ndarray::ScalarOperand
+ AddAssign
+ DivAssign,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let n_samples = x.shape()[0];
if n_samples != labels.len() {
return Err(MetricsError::InvalidInput(format!(
"x has {} samples, but labels has {} samples",
n_samples,
labels.len()
)));
}
let k = k.unwrap_or_else(|| {
if n_samples <= 1 {
1
} else if n_samples < 10 {
std::cmp::min(2, n_samples - 1)
} else {
std::cmp::min(5, n_samples / 10)
}
});
let mut unique_labels = Vec::new();
for &label in labels.iter() {
if !unique_labels.contains(&label) {
unique_labels.push(label);
}
}
unique_labels.sort();
let mut all_knn_distances = Vec::new();
let mut cluster_idx = HashMap::new();
for label in &unique_labels {
cluster_idx.insert(*label, Vec::new());
}
for (i, &label) in labels.iter().enumerate() {
if let Some(indices) = cluster_idx.get_mut(&label) {
indices.push(i);
}
}
for i in 0..n_samples {
let current_point = x.row(i);
let mut distances = Vec::new();
for j in 0..n_samples {
if i != j {
let other_point = x.row(j);
let dist = calculate_euclidean_distance(¤t_point, &other_point);
distances.push(dist);
}
}
distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let k_distance = if distances.len() >= k {
distances[k - 1] } else if !distances.is_empty() {
distances[distances.len() - 1] } else {
F::zero() };
all_knn_distances.push((i, k_distance));
}
let mut ldf = HashMap::new();
for &label in &unique_labels {
let cluster_indices = cluster_idx.get(&label).expect("Operation failed");
if cluster_indices.is_empty() {
continue;
}
let mut cluster_knn_sum = F::zero();
let mut count = 0;
for &idx in cluster_indices {
cluster_knn_sum += all_knn_distances[idx].1;
count += 1;
}
let avg_knn = if count > 0 {
cluster_knn_sum / F::from(count).expect("Failed to convert to float")
} else {
F::zero()
};
let factor = if avg_knn > F::zero() {
F::one() / avg_knn
} else {
F::zero()
};
ldf.insert(label, factor);
}
Ok(ldf)
}
#[allow(dead_code)]
pub fn relative_density_index<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
k: Option<usize>,
) -> Result<F>
where
F: Float
+ NumCast
+ std::fmt::Debug
+ scirs2_core::ndarray::ScalarOperand
+ AddAssign
+ DivAssign,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let n_samples = x.shape()[0];
if n_samples != labels.len() {
return Err(MetricsError::InvalidInput(format!(
"x has {} samples, but labels has {} samples",
n_samples,
labels.len()
)));
}
let mut unique_labels = Vec::new();
for &label in labels.iter() {
if !unique_labels.contains(&label) {
unique_labels.push(label);
}
}
unique_labels.sort();
let k = k.unwrap_or_else(|| {
if n_samples <= 1 {
1
} else if n_samples < 10 {
std::cmp::min(2, n_samples - 1)
} else {
std::cmp::min(5, n_samples / 10)
}
});
let mut intra_density_sum = F::zero();
let mut inter_density_sum = F::zero();
for (i, &label_i) in labels.iter().enumerate() {
let mut intra_distances = Vec::new();
let mut inter_distances = Vec::new();
for (j, &label_j) in labels.iter().enumerate() {
if i != j {
let dist = calculate_euclidean_distance(&x.row(i), &x.row(j));
if label_i == label_j {
intra_distances.push(dist);
} else {
inter_distances.push(dist);
}
}
}
intra_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let intra_k = std::cmp::min(k, intra_distances.len());
if intra_k > 0 {
let knn_intra_dist = intra_distances[intra_k - 1];
let intra_density = if knn_intra_dist > F::zero() {
F::one() / knn_intra_dist
} else {
F::zero()
};
intra_density_sum += intra_density;
}
inter_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let inter_k = std::cmp::min(k, inter_distances.len());
if inter_k > 0 {
let knn_inter_dist = inter_distances[inter_k - 1];
let inter_density = if knn_inter_dist > F::zero() {
F::one() / knn_inter_dist
} else {
F::zero()
};
inter_density_sum += inter_density;
}
}
let avg_intra_density = if n_samples > 0 {
intra_density_sum / F::from(n_samples).expect("Failed to convert to float")
} else {
F::zero()
};
let avg_inter_density = if n_samples > 0 {
inter_density_sum / F::from(n_samples).expect("Failed to convert to float")
} else {
F::zero()
};
let rdi = if avg_inter_density > F::zero() {
avg_intra_density / avg_inter_density
} else if avg_intra_density > F::zero() {
F::max_value() } else {
F::one() };
Ok(rdi)
}
#[allow(dead_code)]
pub fn density_based_cluster_validity<F, S1, S2, D>(
x: &ArrayBase<S1, Ix2>,
labels: &ArrayBase<S2, D>,
k: Option<usize>,
) -> Result<F>
where
F: Float
+ NumCast
+ std::fmt::Debug
+ scirs2_core::ndarray::ScalarOperand
+ AddAssign
+ DivAssign,
S1: Data<Elem = F>,
S2: Data<Elem = usize>,
D: Dimension,
{
let n_samples = x.shape()[0];
if n_samples != labels.len() {
return Err(MetricsError::InvalidInput(format!(
"x has {} samples, but labels has {} samples",
n_samples,
labels.len()
)));
}
let k = k.unwrap_or_else(|| {
if n_samples <= 1 {
1
} else if n_samples < 10 {
std::cmp::min(2, n_samples - 1)
} else {
std::cmp::min(5, n_samples / 10)
}
});
let mut unique_labels = Vec::new();
for &label in labels.iter() {
if !unique_labels.contains(&label) {
unique_labels.push(label);
}
}
if unique_labels.len() < 2 {
return Err(MetricsError::InvalidInput(
"At least two clusters are required to calculate DBCV".to_string(),
));
}
unique_labels.sort();
let mut cluster_indices = HashMap::new();
for label in &unique_labels {
cluster_indices.insert(*label, Vec::new());
}
for (i, &label) in labels.iter().enumerate() {
if let Some(indices) = cluster_indices.get_mut(&label) {
indices.push(i);
}
}
let mut sparseness_values = Vec::new();
for &label in &unique_labels {
let indices = cluster_indices.get(&label).expect("Operation failed");
if indices.len() <= 1 {
sparseness_values.push(F::zero());
continue;
}
let mut core_distances = Vec::new();
for &i in indices {
let mut distances = Vec::new();
for &j in indices {
if i != j {
let dist = calculate_euclidean_distance(&x.row(i), &x.row(j));
distances.push(dist);
}
}
distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let k_actual = std::cmp::min(k, distances.len());
if k_actual > 0 {
core_distances.push(distances[k_actual - 1]);
} else {
core_distances.push(F::zero());
}
}
let avg_core_distance = if !core_distances.is_empty() {
core_distances.iter().fold(F::zero(), |sum, &val| sum + val)
/ F::from(core_distances.len()).expect("Operation failed")
} else {
F::zero()
};
let variance = if core_distances.len() > 1 {
let mean = avg_core_distance;
let sum_squared_diff = core_distances
.iter()
.fold(F::zero(), |sum, &val| sum + (val - mean) * (val - mean));
sum_squared_diff / F::from(core_distances.len() - 1).expect("Operation failed")
} else {
F::zero()
};
let sparseness = avg_core_distance * (F::one() + variance.sqrt());
sparseness_values.push(sparseness);
}
let mut separation_matrix = vec![vec![F::zero(); unique_labels.len()]; unique_labels.len()];
for (i, &label_i) in unique_labels.iter().enumerate() {
let indices_i = cluster_indices.get(&label_i).expect("Operation failed");
for (j, &label_j) in unique_labels.iter().enumerate() {
if i == j {
continue;
}
let indices_j = cluster_indices.get(&label_j).expect("Operation failed");
let mut min_dist = F::max_value();
for &idx_i in indices_i {
for &idx_j in indices_j {
let dist = calculate_euclidean_distance(&x.row(idx_i), &x.row(idx_j));
min_dist = F::min(min_dist, dist);
}
}
separation_matrix[i][j] = min_dist;
}
}
let mut cluster_validity = Vec::new();
for (i, &_) in unique_labels.iter().enumerate() {
let cluster_sparseness = sparseness_values[i];
let mut min_separation = F::max_value();
for j in 0..unique_labels.len() {
if i != j && separation_matrix[i][j] < min_separation {
min_separation = separation_matrix[i][j];
}
}
if min_separation == F::max_value() {
min_separation = F::zero();
}
let validity = if min_separation > F::zero() || cluster_sparseness > F::zero() {
(min_separation - cluster_sparseness) / F::max(min_separation, cluster_sparseness)
} else {
F::zero()
};
cluster_validity.push(validity);
}
let mut weighted_sum = F::zero();
let mut weight_sum = F::zero();
for (i, &label) in unique_labels.iter().enumerate() {
let weight = F::from(
cluster_indices
.get(&label)
.expect("Failed to convert to float")
.len(),
)
.expect("Operation failed");
weighted_sum += weight * cluster_validity[i];
weight_sum += weight;
}
let dbcv = if weight_sum > F::zero() {
weighted_sum / weight_sum
} else {
F::zero()
};
Ok(dbcv)
}
#[allow(dead_code)]
fn calculate_euclidean_distance<F, S1, S2>(a: &ArrayBase<S1, Ix1>, b: &ArrayBase<S2, Ix1>) -> F
where
F: Float,
S1: Data<Elem = F>,
S2: Data<Elem = F>,
{
let mut sum = F::zero();
for (x, y) in a.iter().zip(b.iter()) {
let diff = *x - *y;
sum = sum + diff * diff;
}
sum.sqrt()
}
#[cfg(test)]
mod tests {
use super::*;
use scirs2_core::ndarray::Array2;
#[test]
fn test_local_density_factor() {
let well_separated = Array2::from_shape_vec(
(6, 2),
vec![
1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 10.0, 12.0, 10.2, 11.8, 10.5, 12.2,
],
)
.expect("Operation failed");
let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
let factors =
local_density_factor(&well_separated, &labels, Some(2)).expect("Operation failed");
assert!(factors.len() == 2);
assert!(factors.contains_key(&0));
assert!(factors.contains_key(&1));
let varying_density = Array2::from_shape_vec(
(6, 2),
vec![
1.0, 1.1, 1.05, 1.05, 1.1, 1.0, 5.0, 5.0, 6.0, 6.0, 7.0, 7.0, ],
)
.expect("Operation failed");
let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
let factors =
local_density_factor(&varying_density, &labels, Some(2)).expect("Operation failed");
assert!(
*factors.get(&0).expect("Operation failed")
> *factors.get(&1).expect("Operation failed")
);
}
#[test]
fn test_relative_density_index() {
let well_separated = Array2::from_shape_vec(
(6, 2),
vec![
1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 10.0, 12.0, 10.2, 11.8, 10.5, 12.2,
],
)
.expect("Operation failed");
let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
let rdi =
relative_density_index(&well_separated, &labels, Some(2)).expect("Operation failed");
assert!(rdi > 1.0);
let overlapping = Array2::from_shape_vec(
(6, 2),
vec![1.0, 2.0, 1.5, 1.8, 3.0, 3.0, 3.0, 3.0, 4.0, 4.5, 5.0, 5.5],
)
.expect("Operation failed");
let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
let rdi_overlapping =
relative_density_index(&overlapping, &labels, Some(2)).expect("Operation failed");
assert!(rdi > rdi_overlapping);
}
#[test]
fn test_density_based_cluster_validity() {
let well_separated = Array2::from_shape_vec(
(6, 2),
vec![
1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 10.0, 12.0, 10.2, 11.8, 10.5, 12.2,
],
)
.expect("Operation failed");
let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
let dbcv = density_based_cluster_validity(&well_separated, &labels, Some(2))
.expect("Operation failed");
assert!(dbcv > 0.0);
let poor_clustering = Array2::from_shape_vec(
(6, 2),
vec![1.0, 2.0, 8.0, 9.0, 1.2, 2.2, 8.0, 9.0, 1.0, 2.0, 8.0, 9.0],
)
.expect("Operation failed");
let bad_labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
let bad_dbcv = density_based_cluster_validity(&poor_clustering, &bad_labels, Some(2))
.expect("Operation failed");
assert!(dbcv > bad_dbcv);
}
}