pub mod hdbscan;
pub mod optics;
use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
use scirs2_core::numeric::{Float, FromPrimitive};
mod distance {
use scirs2_core::numeric::Float;
pub fn euclidean<F: Float>(a: &[F], b: &[F]) -> F {
let mut sum = F::zero();
for (a_i, b_i) in a.iter().zip(b.iter()) {
let diff = *a_i - *b_i;
sum = sum + diff * diff;
}
sum.sqrt()
}
pub fn manhattan<F: Float>(a: &[F], b: &[F]) -> F {
let mut sum = F::zero();
for (a_i, b_i) in a.iter().zip(b.iter()) {
sum = sum + (*a_i - *b_i).abs();
}
sum
}
pub fn chebyshev<F: Float>(a: &[F], b: &[F]) -> F {
let mut max = F::zero();
for (a_i, b_i) in a.iter().zip(b.iter()) {
let abs_diff = (*a_i - *b_i).abs();
if abs_diff > max {
max = abs_diff;
}
}
max
}
pub fn minkowski<F: Float>(a: &[F], b: &[F], p: F) -> F {
let mut sum = F::zero();
for (a_i, b_i) in a.iter().zip(b.iter()) {
sum = sum + ((*a_i - *b_i).abs()).powf(p);
}
sum.powf(F::one() / p)
}
}
use std::collections::HashSet;
use std::fmt::Debug;
use crate::error::{ClusteringError, Result};
pub mod labels {
pub const NOISE: i32 = -1;
pub const UNDEFINED: i32 = -2;
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DistanceMetric {
Euclidean,
Manhattan,
Chebyshev,
Minkowski,
}
#[allow(dead_code)]
pub fn dbscan<F: Float + FromPrimitive + Debug + PartialOrd>(
data: ArrayView2<F>,
eps: F,
min_samples: usize,
metric: Option<DistanceMetric>,
) -> Result<Array1<i32>> {
let n_samples = data.shape()[0];
if n_samples == 0 {
return Err(ClusteringError::InvalidInput("Empty input data".into()));
}
if eps <= F::zero() {
return Err(ClusteringError::InvalidInput("eps must be positive".into()));
}
if min_samples < 1 {
return Err(ClusteringError::InvalidInput(
"min_samples must be at least 1".into(),
));
}
let mut labels = vec![labels::UNDEFINED; n_samples];
let mut cluster_label: i32 = 0;
let mut distances = Array2::<F>::zeros((n_samples, n_samples));
for i in 0..n_samples {
for j in (i + 1)..n_samples {
let point1 = data.row(i).to_vec();
let point2 = data.row(j).to_vec();
let dist = match metric.unwrap_or(DistanceMetric::Euclidean) {
DistanceMetric::Euclidean => distance::euclidean(&point1, &point2),
DistanceMetric::Manhattan => distance::manhattan(&point1, &point2),
DistanceMetric::Chebyshev => distance::chebyshev(&point1, &point2),
DistanceMetric::Minkowski => distance::minkowski(
&point1,
&point2,
F::from(3.0).expect("Failed to convert constant to float"),
),
};
distances[[i, j]] = dist;
distances[[j, i]] = dist; }
}
let mut neighborhoods: Vec<Vec<usize>> = Vec::with_capacity(n_samples);
for i in 0..n_samples {
let mut neighbors = Vec::new();
for j in 0..n_samples {
if i != j && distances[[i, j]] <= eps {
neighbors.push(j);
}
}
neighborhoods.push(neighbors);
}
for point_idx in 0..n_samples {
if labels[point_idx] != labels::UNDEFINED {
continue;
}
let neighbors = &neighborhoods[point_idx];
if neighbors.len() + 1 < min_samples {
labels[point_idx] = labels::NOISE;
continue;
}
labels[point_idx] = cluster_label;
let mut queue: Vec<usize> = neighbors.clone();
let mut processed = HashSet::new();
processed.insert(point_idx);
let mut idx = 0;
while idx < queue.len() {
let current = queue[idx];
idx += 1;
if processed.contains(¤t) {
continue;
}
processed.insert(current);
if labels[current] == labels::NOISE {
labels[current] = cluster_label;
continue;
}
if labels[current] != labels::UNDEFINED {
continue;
}
labels[current] = cluster_label;
let current_neighbors = &neighborhoods[current];
if current_neighbors.len() + 1 >= min_samples {
for &neighbor in current_neighbors {
if !processed.contains(&neighbor) {
queue.push(neighbor);
}
}
}
}
cluster_label += 1;
}
Ok(Array1::from(labels))
}
#[allow(dead_code)]
pub fn optics<F: Float + FromPrimitive + Debug + PartialOrd>(
data: ArrayView2<F>,
min_samples: usize,
max_eps: Option<F>,
metric: Option<DistanceMetric>,
) -> Result<Array1<i32>> {
let result = optics::optics(data, min_samples, max_eps, metric)?;
let default_eps = match max_eps {
Some(_eps) => _eps.to_f64().unwrap_or(0.5) / 10.0,
None => 0.5,
};
Ok(optics::extract_dbscan_clustering(&result, default_eps))
}