use crate::error::{ClusteringError, Result};
#[derive(Debug, Clone)]
pub struct DensityPeaksAdvResult {
pub labels: Vec<i64>,
pub centers: Vec<usize>,
pub rho: Vec<f64>,
pub delta: Vec<f64>,
pub n_clusters: usize,
}
#[derive(Debug, Clone, PartialEq)]
pub enum KernelType {
Gaussian,
Cutoff,
}
pub struct DensityPeaksAdv {
pub d_c: Option<f64>,
pub kernel: KernelType,
pub n_clusters: Option<usize>,
pub rho_threshold: f64,
pub delta_threshold: f64,
}
impl DensityPeaksAdv {
pub fn new() -> Self {
Self {
d_c: None,
kernel: KernelType::Gaussian,
n_clusters: None,
rho_threshold: 0.0,
delta_threshold: 0.0,
}
}
pub fn with_d_c(mut self, d_c: f64) -> Self {
self.d_c = Some(d_c);
self
}
pub fn with_kernel(mut self, kernel: KernelType) -> Self {
self.kernel = kernel;
self
}
pub fn with_n_clusters(mut self, k: usize) -> Self {
self.n_clusters = Some(k);
self
}
pub fn with_thresholds(mut self, rho_t: f64, delta_t: f64) -> Self {
self.rho_threshold = rho_t;
self.delta_threshold = delta_t;
self
}
pub fn fit(&self, data: &[Vec<f64>]) -> Result<DensityPeaksAdvResult> {
let n = data.len();
if n == 0 {
return Err(ClusteringError::InvalidInput(
"input data must not be empty".to_string(),
));
}
let dim = data[0].len();
for (i, point) in data.iter().enumerate() {
if point.len() != dim {
return Err(ClusteringError::InvalidInput(format!(
"point {} has dimension {} but expected {}",
i,
point.len(),
dim
)));
}
}
let dists = compute_pairwise_distances(data);
let d_c = match self.d_c {
Some(v) => {
if v <= 0.0 {
return Err(ClusteringError::InvalidInput(
"d_c must be positive".to_string(),
));
}
v
}
None => auto_select_dc(&dists, n),
};
let rho = compute_rho(&dists, n, d_c, &self.kernel);
let mut order: Vec<usize> = (0..n).collect();
order.sort_by(|&a, &b| {
rho[b]
.partial_cmp(&rho[a])
.unwrap_or(std::cmp::Ordering::Equal)
});
let (delta, nearest_higher) = compute_delta(&dists, n, &rho, &order);
let centers = select_centers(n, &rho, &delta, self.n_clusters, self.rho_threshold, self.delta_threshold)?;
let labels = propagate_labels(n, ¢ers, &nearest_higher, &order);
let n_clusters = centers.len();
Ok(DensityPeaksAdvResult {
labels,
centers,
rho,
delta,
n_clusters,
})
}
}
impl Default for DensityPeaksAdv {
fn default() -> Self {
Self::new()
}
}
fn compute_pairwise_distances(data: &[Vec<f64>]) -> Vec<f64> {
let n = data.len();
let mut dists = vec![0.0f64; n * n];
for i in 0..n {
for j in (i + 1)..n {
let d: f64 = data[i]
.iter()
.zip(data[j].iter())
.map(|(a, b)| (a - b).powi(2))
.sum::<f64>()
.sqrt();
dists[i * n + j] = d;
dists[j * n + i] = d;
}
}
dists
}
fn auto_select_dc(dists: &[f64], n: usize) -> f64 {
let mut upper: Vec<f64> = Vec::with_capacity(n * (n - 1) / 2);
for i in 0..n {
for j in (i + 1)..n {
upper.push(dists[i * n + j]);
}
}
upper.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let idx = ((upper.len() as f64) * 0.02) as usize;
upper.get(idx).copied().unwrap_or(1.0).max(1e-10)
}
fn compute_rho(dists: &[f64], n: usize, d_c: f64, kernel: &KernelType) -> Vec<f64> {
(0..n)
.map(|i| match kernel {
KernelType::Gaussian => {
let two_dc2 = 2.0 * d_c * d_c;
(0..n)
.filter(|&j| j != i)
.map(|j| {
let d = dists[i * n + j];
(-(d * d) / two_dc2).exp()
})
.sum()
}
KernelType::Cutoff => {
(0..n)
.filter(|&j| j != i && dists[i * n + j] < d_c)
.count() as f64
}
})
.collect()
}
fn compute_delta(
dists: &[f64],
n: usize,
rho: &[f64],
order: &[usize],
) -> (Vec<f64>, Vec<usize>) {
let mut delta = vec![f64::INFINITY; n];
let mut nearest_higher = vec![0usize; n];
for (rank, &i) in order.iter().enumerate() {
if rank == 0 {
let max_d = (0..n)
.filter(|&j| j != i)
.map(|j| dists[i * n + j])
.fold(f64::NEG_INFINITY, f64::max);
delta[i] = if max_d.is_finite() { max_d } else { 0.0 };
nearest_higher[i] = i;
} else {
for &j in order[..rank].iter() {
let d = dists[i * n + j];
if d < delta[i] {
delta[i] = d;
nearest_higher[i] = j;
}
}
if delta[i].is_infinite() {
delta[i] = 0.0;
}
}
}
(delta, nearest_higher)
}
fn select_centers(
n: usize,
rho: &[f64],
delta: &[f64],
n_clusters: Option<usize>,
rho_threshold: f64,
delta_threshold: f64,
) -> Result<Vec<usize>> {
match n_clusters {
Some(k) => {
if k > n {
return Err(ClusteringError::InvalidInput(format!(
"n_clusters ({}) exceeds number of data points ({})",
k, n
)));
}
let mut gamma: Vec<(f64, usize)> = (0..n).map(|i| (rho[i] * delta[i], i)).collect();
gamma.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
Ok(gamma[..k].iter().map(|(_, idx)| *idx).collect())
}
None => {
let centers: Vec<usize> = (0..n)
.filter(|&i| rho[i] > rho_threshold && delta[i] > delta_threshold)
.collect();
Ok(centers)
}
}
}
fn propagate_labels(
n: usize,
centers: &[usize],
nearest_higher: &[usize],
order: &[usize],
) -> Vec<i64> {
let mut labels = vec![-1i64; n];
for (cluster_id, ¢er) in centers.iter().enumerate() {
labels[center] = cluster_id as i64;
}
for &i in order.iter() {
if labels[i] == -1 {
labels[i] = labels[nearest_higher[i]];
}
}
labels
}
#[cfg(test)]
mod tests {
use super::*;
fn make_two_gaussians() -> Vec<Vec<f64>> {
vec![
vec![0.0, 0.0],
vec![0.2, 0.1],
vec![-0.1, 0.2],
vec![0.1, -0.1],
vec![0.3, 0.0],
vec![10.0, 10.0],
vec![10.1, 9.9],
vec![9.9, 10.1],
vec![10.2, 10.0],
vec![10.0, 9.8],
]
}
#[test]
fn test_density_peaks_two_clusters_n_clusters() {
let data = make_two_gaussians();
let result = DensityPeaksAdv::new()
.with_n_clusters(2)
.fit(&data)
.expect("fit should succeed");
assert_eq!(result.n_clusters, 2);
assert_eq!(result.labels.len(), 10);
for &lbl in &result.labels {
assert!(lbl >= 0, "label should be non-negative");
}
}
#[test]
fn test_density_peaks_gaussian_kernel() {
let data = make_two_gaussians();
let result = DensityPeaksAdv::new()
.with_kernel(KernelType::Gaussian)
.with_n_clusters(2)
.fit(&data)
.expect("Gaussian kernel fit should succeed");
assert_eq!(result.n_clusters, 2);
assert_eq!(result.rho.len(), 10);
assert_eq!(result.delta.len(), 10);
}
#[test]
fn test_density_peaks_cutoff_kernel() {
let data = make_two_gaussians();
let result = DensityPeaksAdv::new()
.with_kernel(KernelType::Cutoff)
.with_n_clusters(2)
.fit(&data)
.expect("Cutoff kernel fit should succeed");
assert_eq!(result.n_clusters, 2);
}
#[test]
fn test_density_peaks_explicit_dc() {
let data = make_two_gaussians();
let result = DensityPeaksAdv::new()
.with_d_c(0.5)
.with_n_clusters(2)
.fit(&data)
.expect("fit with explicit d_c should succeed");
assert_eq!(result.n_clusters, 2);
}
#[test]
fn test_density_peaks_threshold_mode() {
let data = make_two_gaussians();
let result = DensityPeaksAdv::new()
.with_thresholds(0.0, 0.0)
.fit(&data)
.expect("threshold mode fit should succeed");
assert!(result.n_clusters > 0);
}
#[test]
fn test_density_peaks_empty_input() {
let data: Vec<Vec<f64>> = vec![];
let err = DensityPeaksAdv::new().fit(&data);
assert!(err.is_err());
}
#[test]
fn test_density_peaks_invalid_n_clusters() {
let data = make_two_gaussians();
let err = DensityPeaksAdv::new().with_n_clusters(100).fit(&data);
assert!(err.is_err());
}
#[test]
fn test_auto_select_dc() {
let data = make_two_gaussians();
let dists = compute_pairwise_distances(&data);
let dc = auto_select_dc(&dists, data.len());
assert!(dc > 0.0, "d_c must be positive");
}
#[test]
fn test_same_cluster_labeling() {
let data = make_two_gaussians();
let result = DensityPeaksAdv::new()
.with_n_clusters(2)
.fit(&data)
.expect("fit should succeed");
let label_a = result.labels[0];
for i in 1..5 {
assert_eq!(result.labels[i], label_a, "cluster A consistency");
}
let label_b = result.labels[5];
for i in 6..10 {
assert_eq!(result.labels[i], label_b, "cluster B consistency");
}
assert_ne!(label_a, label_b, "two clusters must differ");
}
}