use super::distance::{DistanceMetric, Euclidean};
use super::flat::DataRef;
use super::projindex::ProjIndex;
use super::util;
use crate::error::{Error, Result};
use std::collections::HashMap;
#[derive(Debug, Clone)]
pub struct Dbscan<D: DistanceMetric = Euclidean> {
epsilon: f32,
min_pts: usize,
metric: D,
}
pub const NOISE: usize = usize::MAX;
const UNCLASSIFIED: i32 = -2;
const NOISE_LABEL: i32 = -1;
impl Dbscan<Euclidean> {
pub fn new(epsilon: f32, min_pts: usize) -> Self {
assert!(epsilon > 0.0, "epsilon must be positive");
assert!(min_pts > 0, "min_pts must be at least 1");
Self {
epsilon,
min_pts,
metric: Euclidean,
}
}
}
impl<D: DistanceMetric> Dbscan<D> {
pub fn with_metric(epsilon: f32, min_pts: usize, metric: D) -> Self {
assert!(epsilon > 0.0, "epsilon must be positive");
assert!(min_pts > 0, "min_pts must be at least 1");
Self {
epsilon,
min_pts,
metric,
}
}
pub fn with_epsilon(mut self, epsilon: f32) -> Self {
assert!(epsilon > 0.0, "epsilon must be positive");
self.epsilon = epsilon;
self
}
pub fn with_min_pts(mut self, min_pts: usize) -> Self {
assert!(min_pts > 0, "min_pts must be at least 1");
self.min_pts = min_pts;
self
}
pub fn fit_predict_with_noise(
&self,
data: &(impl DataRef + ?Sized),
) -> Result<Vec<Option<usize>>> {
let labels = self.fit_predict(data)?;
Ok(labels
.into_iter()
.map(|l| if l == NOISE { None } else { Some(l) })
.collect())
}
pub fn is_noise(label: usize) -> bool {
label == NOISE
}
fn build_grid(data: &(impl DataRef + ?Sized), epsilon: f32) -> HashMap<Vec<i64>, Vec<usize>> {
let cell_size = epsilon;
let mut grid: HashMap<Vec<i64>, Vec<usize>> = HashMap::new();
for i in 0..data.n() {
let cell: Vec<i64> = data
.row(i)
.iter()
.map(|&x| (x / cell_size).floor() as i64)
.collect();
grid.entry(cell).or_default().push(i);
}
grid
}
fn region_query_grid(
&self,
data: &(impl DataRef + ?Sized),
point_idx: usize,
grid: &HashMap<Vec<i64>, Vec<usize>>,
) -> Vec<usize> {
let point = data.row(point_idx);
let d = point.len();
let cell_size = self.epsilon;
let center_cell: Vec<i64> = point
.iter()
.map(|&x| (x / cell_size).floor() as i64)
.collect();
let mut neighbors = Vec::new();
if d > 8 {
for j in 0..data.n() {
if j != point_idx && self.metric.distance(point, data.row(j)) <= self.epsilon {
neighbors.push(j);
}
}
return neighbors;
}
let mut cell_buf = [0i64; 8];
cell_buf[..d].copy_from_slice(¢er_cell);
let n_adjacent = 3i64.pow(d as u32);
for offset_idx in 0..n_adjacent {
let mut cell = cell_buf;
let mut idx = offset_idx;
for c in cell[..d].iter_mut() {
*c += (idx % 3) - 1;
idx /= 3;
}
if let Some(points) = grid.get(&cell[..d]) {
for &j in points {
if j != point_idx && self.metric.distance(point, data.row(j)) <= self.epsilon {
neighbors.push(j);
}
}
}
}
neighbors
}
#[allow(clippy::too_many_arguments)]
fn expand_cluster_grid(
&self,
data: &(impl DataRef + ?Sized),
point_idx: usize,
neighbors: &[usize],
labels: &mut [i32],
cluster_id: i32,
visited: &mut [bool],
grid: &HashMap<Vec<i64>, Vec<usize>>,
) {
labels[point_idx] = cluster_id;
let mut to_process: Vec<usize> = neighbors.to_vec();
while let Some(neighbor_idx) = to_process.pop() {
if labels[neighbor_idx] == UNCLASSIFIED || labels[neighbor_idx] == NOISE_LABEL {
labels[neighbor_idx] = cluster_id;
}
if visited[neighbor_idx] {
continue;
}
visited[neighbor_idx] = true;
let neighbor_neighbors = self.region_query_grid(data, neighbor_idx, grid);
if neighbor_neighbors.len() + 1 >= self.min_pts {
for nn in neighbor_neighbors {
if !visited[nn] {
to_process.push(nn);
}
}
}
}
}
fn region_query_precomputed(&self, dists: &[f32], n: usize, point_idx: usize) -> Vec<usize> {
let row = point_idx * n;
(0..n)
.filter(|&j| j != point_idx && dists[row + j] <= self.epsilon)
.collect()
}
#[allow(clippy::too_many_arguments)]
fn expand_cluster_precomputed(
&self,
dists: &[f32],
n: usize,
point_idx: usize,
neighbors: &[usize],
labels: &mut [i32],
cluster_id: i32,
visited: &mut [bool],
) {
labels[point_idx] = cluster_id;
let mut to_process: Vec<usize> = neighbors.to_vec();
while let Some(neighbor_idx) = to_process.pop() {
if labels[neighbor_idx] == UNCLASSIFIED || labels[neighbor_idx] == NOISE_LABEL {
labels[neighbor_idx] = cluster_id;
}
if visited[neighbor_idx] {
continue;
}
visited[neighbor_idx] = true;
let neighbor_neighbors = self.region_query_precomputed(dists, n, neighbor_idx);
if neighbor_neighbors.len() + 1 >= self.min_pts {
for nn in neighbor_neighbors {
if !visited[nn] {
to_process.push(nn);
}
}
}
}
}
}
impl Default for Dbscan<Euclidean> {
fn default() -> Self {
Self::new(0.5, 5)
}
}
impl<D: DistanceMetric> Dbscan<D> {
pub fn fit_predict(&self, data: &(impl DataRef + ?Sized)) -> Result<Vec<usize>> {
let n = data.n();
if n == 0 {
return Err(Error::EmptyInput);
}
let d = data.d();
if d == 0 {
return Err(Error::InvalidParameter {
name: "dimension",
message: "must be at least 1",
});
}
for i in 1..n {
if data.row(i).len() != d {
return Err(Error::DimensionMismatch {
expected: d,
found: data.row(i).len(),
});
}
}
util::validate_finite(data)?;
let mut labels = vec![UNCLASSIFIED; n];
let mut visited = vec![false; n];
let mut cluster_id: i32 = 0;
const MAX_MATRIX_BYTES: usize = 1024 * 1024 * 1024;
let use_precomputed = (n as u64) * (n as u64) * 4 <= MAX_MATRIX_BYTES as u64;
let use_grid = !use_precomputed && d <= 5;
if use_precomputed {
let dists = util::pairwise_distance_matrix(data, &self.metric);
for point_idx in 0..n {
if visited[point_idx] {
continue;
}
visited[point_idx] = true;
let neighbors = self.region_query_precomputed(&dists, n, point_idx);
if neighbors.len() + 1 < self.min_pts {
labels[point_idx] = NOISE_LABEL;
continue;
}
self.expand_cluster_precomputed(
&dists,
n,
point_idx,
&neighbors,
&mut labels,
cluster_id,
&mut visited,
);
cluster_id += 1;
}
} else if use_grid {
let grid = Self::build_grid(data, self.epsilon);
for point_idx in 0..n {
if visited[point_idx] {
continue;
}
visited[point_idx] = true;
let neighbors = self.region_query_grid(data, point_idx, &grid);
if neighbors.len() + 1 < self.min_pts {
labels[point_idx] = NOISE_LABEL;
continue;
}
self.expand_cluster_grid(
data,
point_idx,
&neighbors,
&mut labels,
cluster_id,
&mut visited,
&grid,
);
cluster_id += 1;
}
} else {
let proj = ProjIndex::new(data, &self.metric, 12);
for point_idx in 0..n {
if visited[point_idx] {
continue;
}
visited[point_idx] = true;
let mut neighbors = proj.range_query(data.row(point_idx), self.epsilon);
neighbors.retain(|&j| j != point_idx);
if neighbors.len() + 1 < self.min_pts {
labels[point_idx] = NOISE_LABEL;
continue;
}
labels[point_idx] = cluster_id;
let mut to_process = neighbors;
while let Some(neighbor_idx) = to_process.pop() {
if labels[neighbor_idx] == UNCLASSIFIED || labels[neighbor_idx] == NOISE_LABEL {
labels[neighbor_idx] = cluster_id;
}
if visited[neighbor_idx] {
continue;
}
visited[neighbor_idx] = true;
let mut nn = proj.range_query(data.row(neighbor_idx), self.epsilon);
nn.retain(|&j| j != neighbor_idx);
if nn.len() + 1 >= self.min_pts {
for idx in nn {
if !visited[idx] {
to_process.push(idx);
}
}
}
}
cluster_id += 1;
}
}
let mut out: Vec<usize> = Vec::with_capacity(n);
for l in labels {
if l >= 0 {
out.push(l as usize);
} else {
out.push(NOISE);
}
}
Ok(out)
}
}
#[cfg(test)]
#[allow(clippy::needless_range_loop)]
mod tests {
use super::*;
#[test]
fn test_dbscan_two_clusters() {
let data = vec![
vec![0.0, 0.0],
vec![0.1, 0.0],
vec![0.0, 0.1],
vec![0.1, 0.1],
vec![0.05, 0.05],
vec![5.0, 5.0],
vec![5.1, 5.0],
vec![5.0, 5.1],
vec![5.1, 5.1],
vec![5.05, 5.05],
];
let dbscan = Dbscan::new(0.3, 3);
let labels = dbscan.fit_predict(&data).unwrap();
assert_eq!(labels.len(), 10);
let cluster1 = labels[0];
for label in &labels[1..5] {
assert_eq!(*label, cluster1);
}
let cluster2 = labels[5];
for label in &labels[6..10] {
assert_eq!(*label, cluster2);
}
assert_ne!(cluster1, cluster2);
}
#[test]
fn test_dbscan_with_noise() {
let data = vec![
vec![0.0, 0.0],
vec![0.1, 0.0],
vec![0.0, 0.1],
vec![0.1, 0.1],
vec![100.0, 100.0],
vec![5.0, 5.0],
vec![5.1, 5.0],
vec![5.0, 5.1],
vec![5.1, 5.1],
];
let dbscan = Dbscan::new(0.3, 3);
let labels = dbscan.fit_predict_with_noise(&data).unwrap();
assert_eq!(labels.len(), 9);
assert!(labels[4].is_none());
for (i, label) in labels.iter().enumerate() {
if i != 4 {
assert!(label.is_some());
}
}
}
#[test]
fn test_dbscan_fit_predict_uses_noise_sentinel() {
let data = vec![
vec![0.0, 0.0],
vec![0.1, 0.0],
vec![0.0, 0.1],
vec![0.1, 0.1],
vec![100.0, 100.0],
vec![5.0, 5.0],
vec![5.1, 5.0],
vec![5.0, 5.1],
vec![5.1, 5.1],
];
let dbscan = Dbscan::new(0.3, 3);
let labels = dbscan.fit_predict(&data).unwrap();
assert_eq!(labels.len(), 9);
assert_eq!(labels[4], NOISE);
assert!(Dbscan::<Euclidean>::is_noise(labels[4]));
}
#[test]
fn test_dbscan_all_noise() {
let data = vec![
vec![0.0, 0.0],
vec![10.0, 0.0],
vec![0.0, 10.0],
vec![10.0, 10.0],
];
let dbscan = Dbscan::new(0.5, 3);
let labels = dbscan.fit_predict_with_noise(&data).unwrap();
for label in labels {
assert!(label.is_none());
}
}
#[test]
fn test_dbscan_all_one_cluster() {
let data = vec![
vec![0.0, 0.0],
vec![0.1, 0.0],
vec![0.0, 0.1],
vec![0.1, 0.1],
];
let dbscan = Dbscan::new(0.5, 2);
let labels = dbscan.fit_predict(&data).unwrap();
let cluster = labels[0];
for label in labels {
assert_eq!(label, cluster);
}
}
#[test]
fn test_dbscan_empty() {
let data: Vec<Vec<f32>> = vec![];
let dbscan = Dbscan::new(0.5, 3);
let result = dbscan.fit_predict(&data);
assert!(result.is_err());
}
#[test]
#[should_panic(expected = "epsilon must be positive")]
fn invalid_epsilon_zero() {
Dbscan::new(0.0, 3);
}
#[test]
#[should_panic(expected = "epsilon must be positive")]
fn invalid_epsilon_negative() {
Dbscan::new(-1.0, 3);
}
#[test]
#[should_panic(expected = "min_pts must be at least 1")]
fn invalid_min_pts_zero() {
Dbscan::new(0.5, 0);
}
#[test]
fn test_dbscan_chain() {
let data: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32 * 0.3, 0.0]).collect();
let dbscan = Dbscan::new(0.5, 2);
let labels = dbscan.fit_predict(&data).unwrap();
let cluster = labels[0];
for label in labels {
assert_eq!(label, cluster);
}
}
#[test]
fn nan_input_rejected() {
let data = vec![vec![0.0, f32::NAN], vec![1.0, 1.0], vec![2.0, 2.0]];
let result = Dbscan::new(0.5, 2).fit_predict(&data);
assert!(result.is_err());
}
#[test]
fn inf_input_rejected() {
let data = vec![vec![0.0, 0.0], vec![1.0, f32::INFINITY], vec![2.0, 2.0]];
let result = Dbscan::new(0.5, 2).fit_predict(&data);
assert!(result.is_err());
}
#[test]
fn scalar_data_d1() {
let data = vec![
vec![0.0],
vec![0.1],
vec![0.2],
vec![10.0],
vec![10.1],
vec![10.2],
];
let labels = Dbscan::new(0.5, 2).fit_predict(&data).unwrap();
assert_eq!(labels[0], labels[1]);
assert_ne!(labels[0], labels[3]);
}
#[test]
fn test_dbscan_with_custom_metric() {
use crate::cluster::distance::SquaredEuclidean;
let data = vec![
vec![0.0, 0.0],
vec![0.1, 0.0],
vec![0.0, 0.1],
vec![0.1, 0.1],
vec![5.0, 5.0],
vec![5.1, 5.0],
vec![5.0, 5.1],
vec![5.1, 5.1],
];
let dbscan = Dbscan::with_metric(0.09, 3, SquaredEuclidean);
let labels = dbscan.fit_predict(&data).unwrap();
assert_eq!(labels.len(), 8);
assert_eq!(labels[0], labels[1]);
assert_ne!(labels[0], labels[4]);
}
#[test]
fn labels_range_property() {
let data = vec![
vec![0.0, 0.0],
vec![0.1, 0.1],
vec![0.2, 0.2],
vec![10.0, 10.0],
vec![10.1, 10.1],
vec![10.2, 10.2],
vec![50.0, 50.0], ];
let labels = Dbscan::new(0.5, 2).fit_predict(&data).unwrap();
let non_noise: Vec<usize> = labels.iter().copied().filter(|&l| l != NOISE).collect();
if !non_noise.is_empty() {
let max_label = *non_noise.iter().max().unwrap();
for l in 0..=max_label {
assert!(
non_noise.contains(&l),
"label {l} missing from range 0..={max_label}"
);
}
}
}
#[test]
fn single_point() {
let data = vec![vec![1.0, 2.0]];
let labels = Dbscan::new(0.5, 2).fit_predict(&data).unwrap();
assert_eq!(labels.len(), 1);
assert_eq!(labels[0], NOISE);
}
#[test]
fn high_dim_few_points() {
let data = vec![
vec![0.0; 100],
{
let mut v = vec![0.0; 100];
v[0] = 0.01;
v
},
vec![10.0; 100],
];
let labels = Dbscan::new(1.0, 2).fit_predict(&data).unwrap();
assert_eq!(labels.len(), 3);
assert_eq!(labels[0], labels[1]); }
#[test]
fn eps_infinity_one_cluster() {
let data = vec![vec![0.0, 0.0], vec![100.0, 100.0], vec![200.0, 200.0]];
let labels = Dbscan::new(1000.0, 2).fit_predict(&data).unwrap();
assert_eq!(labels[0], labels[1]);
assert_eq!(labels[1], labels[2]);
assert_ne!(labels[0], NOISE);
}
#[test]
fn min_pts_1_no_noise() {
let data = vec![vec![0.0, 0.0], vec![100.0, 100.0], vec![200.0, 200.0]];
let labels = Dbscan::new(0.01, 1).fit_predict(&data).unwrap();
for &l in &labels {
assert_ne!(l, NOISE, "min_pts=1 should produce no noise");
}
}
#[test]
fn eps_boundary_inclusive() {
let data = vec![vec![0.0, 0.0], vec![1.0, 0.0]];
let labels = Dbscan::new(1.0, 2).fit_predict(&data).unwrap();
assert_eq!(
labels[0], labels[1],
"points at exactly eps should be neighbors"
);
assert_ne!(labels[0], NOISE);
}
#[test]
fn duplicate_points_same_cluster() {
let data = vec![
vec![1.0, 1.0],
vec![1.0, 1.0],
vec![1.0, 1.0],
vec![10.0, 10.0],
vec![10.0, 10.0],
vec![10.0, 10.0],
];
let labels = Dbscan::new(0.5, 2).fit_predict(&data).unwrap();
assert_eq!(labels[0], labels[1]);
assert_eq!(labels[0], labels[2]);
assert_eq!(labels[3], labels[4]);
assert_eq!(labels[3], labels[5]);
}
#[test]
fn chain_density_connected() {
let data: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32 * 0.3, 0.0]).collect();
let labels = Dbscan::new(0.5, 2).fit_predict(&data).unwrap();
let first = labels[0];
for (i, &l) in labels.iter().enumerate() {
assert_eq!(l, first, "point {i} should be in same cluster as point 0");
}
}
}
#[cfg(test)]
mod proptests {
use super::*;
use proptest::prelude::*;
fn arb_data(max_n: usize, d: usize) -> impl Strategy<Value = Vec<Vec<f32>>> {
proptest::collection::vec(proptest::collection::vec(-10.0f32..10.0, d..=d), 2..=max_n)
}
proptest! {
#[test]
fn labels_valid(data in arb_data(30, 3)) {
let labels = Dbscan::new(1.0, 2).fit_predict(&data).unwrap();
prop_assert_eq!(labels.len(), data.len());
let max_cluster = labels.iter().copied().filter(|&l| l != NOISE).max();
if let Some(max) = max_cluster {
for &l in &labels {
prop_assert!(l == NOISE || l <= max);
}
}
}
#[test]
fn deterministic(data in arb_data(20, 2)) {
let l1 = Dbscan::new(0.5, 2).fit_predict(&data).unwrap();
let l2 = Dbscan::new(0.5, 2).fit_predict(&data).unwrap();
prop_assert_eq!(&l1, &l2, "DBSCAN must be deterministic");
}
#[test]
fn duplicates_same_cluster(
base in proptest::collection::vec(-10.0f32..10.0, 3..=3),
) {
let data = vec![base.clone(), base.clone(), vec![100.0, 100.0, 100.0]];
let labels = Dbscan::new(0.5, 2).fit_predict(&data).unwrap();
prop_assert_eq!(labels[0], labels[1], "duplicates must be in same cluster");
}
#[test]
fn all_noise_when_min_samples_gt_n(data in arb_data(10, 2)) {
let n = data.len();
let labels = Dbscan::new(1.0, n + 1).fit_predict(&data).unwrap();
for (i, &l) in labels.iter().enumerate() {
prop_assert_eq!(l, NOISE, "point {} should be NOISE when min_samples > n", i);
}
}
#[test]
fn monotone_eps(data in arb_data(15, 2)) {
let eps1 = 0.5;
let eps2 = 2.0;
let labels1 = Dbscan::new(eps1, 2).fit_predict(&data).unwrap();
let labels2 = Dbscan::new(eps2, 2).fit_predict(&data).unwrap();
let noise1 = labels1.iter().filter(|&&l| l == NOISE).count();
let noise2 = labels2.iter().filter(|&&l| l == NOISE).count();
prop_assert!(
noise2 <= noise1,
"larger eps should have <= noise: noise(eps={})={} > noise(eps={})={}",
eps2, noise2, eps1, noise1
);
}
#[test]
fn transitivity(data in arb_data(15, 3)) {
let labels = Dbscan::new(1.0, 2).fit_predict(&data).unwrap();
let n = labels.len();
for a in 0..n {
if labels[a] == NOISE { continue; }
for b in (a + 1)..n {
if labels[b] != labels[a] { continue; }
for c in (b + 1)..n {
if labels[c] == labels[b] {
prop_assert_eq!(
labels[a], labels[c],
"transitivity violated: labels[{}]={}, labels[{}]={}, labels[{}]={}",
a, labels[a], b, labels[b], c, labels[c]
);
}
}
}
}
}
#[test]
fn all_identical_points(
point in proptest::collection::vec(-10.0f32..10.0, 2..=2),
n in 3usize..10,
) {
let data: Vec<Vec<f32>> = vec![point; n];
let labels = Dbscan::new(0.1, n).fit_predict(&data).unwrap();
let first = labels[0];
prop_assert_ne!(first, NOISE, "identical points should not be noise");
for (i, &l) in labels.iter().enumerate() {
prop_assert_eq!(l, first, "point {} should match point 0", i);
}
}
}
}