use std::fmt::Debug;
use std::marker::PhantomData;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
use crate::api::{Predictor, UnsupervisedEstimator};
use crate::error::Failed;
use crate::linalg::basic::arrays::{Array1, Array2};
use crate::metrics::distance::euclidian::Euclidian;
use crate::metrics::distance::{Distance, Distances};
use crate::numbers::basenum::Number;
use crate::tree::decision_tree_classifier::which_max;
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug)]
pub struct DBSCAN<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> {
cluster_labels: Vec<i16>,
num_classes: usize,
knn_algorithm: KNNAlgorithm<TX, D>,
eps: f64,
_phantom_ty: PhantomData<TY>,
_phantom_x: PhantomData<X>,
_phantom_y: PhantomData<Y>,
}
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
pub struct DBSCANParameters<T: Number, D: Distance<Vec<T>>> {
#[cfg_attr(feature = "serde", serde(default))]
pub distance: D,
#[cfg_attr(feature = "serde", serde(default))]
pub min_samples: usize,
#[cfg_attr(feature = "serde", serde(default))]
pub eps: f64,
#[cfg_attr(feature = "serde", serde(default))]
pub algorithm: KNNAlgorithmName,
#[cfg_attr(feature = "serde", serde(default))]
_phantom_t: PhantomData<T>,
}
impl<T: Number, D: Distance<Vec<T>>> DBSCANParameters<T, D> {
pub fn with_distance<DD: Distance<Vec<T>>>(self, distance: DD) -> DBSCANParameters<T, DD> {
DBSCANParameters {
distance,
min_samples: self.min_samples,
eps: self.eps,
algorithm: self.algorithm,
_phantom_t: PhantomData,
}
}
pub fn with_min_samples(mut self, min_samples: usize) -> Self {
self.min_samples = min_samples;
self
}
pub fn with_eps(mut self, eps: f64) -> Self {
self.eps = eps;
self
}
pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self {
self.algorithm = algorithm;
self
}
}
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
pub struct DBSCANSearchParameters<T: Number, D: Distance<Vec<T>>> {
#[cfg_attr(feature = "serde", serde(default))]
pub distance: Vec<D>,
#[cfg_attr(feature = "serde", serde(default))]
pub min_samples: Vec<usize>,
#[cfg_attr(feature = "serde", serde(default))]
pub eps: Vec<f64>,
#[cfg_attr(feature = "serde", serde(default))]
pub algorithm: Vec<KNNAlgorithmName>,
_phantom_t: PhantomData<T>,
}
pub struct DBSCANSearchParametersIterator<T: Number, D: Distance<Vec<T>>> {
dbscan_search_parameters: DBSCANSearchParameters<T, D>,
current_distance: usize,
current_min_samples: usize,
current_eps: usize,
current_algorithm: usize,
}
impl<T: Number, D: Distance<Vec<T>>> IntoIterator for DBSCANSearchParameters<T, D> {
type Item = DBSCANParameters<T, D>;
type IntoIter = DBSCANSearchParametersIterator<T, D>;
fn into_iter(self) -> Self::IntoIter {
DBSCANSearchParametersIterator {
dbscan_search_parameters: self,
current_distance: 0,
current_min_samples: 0,
current_eps: 0,
current_algorithm: 0,
}
}
}
impl<T: Number, D: Distance<Vec<T>>> Iterator for DBSCANSearchParametersIterator<T, D> {
type Item = DBSCANParameters<T, D>;
fn next(&mut self) -> Option<Self::Item> {
if self.current_distance == self.dbscan_search_parameters.distance.len()
&& self.current_min_samples == self.dbscan_search_parameters.min_samples.len()
&& self.current_eps == self.dbscan_search_parameters.eps.len()
&& self.current_algorithm == self.dbscan_search_parameters.algorithm.len()
{
return None;
}
let next = DBSCANParameters {
distance: self.dbscan_search_parameters.distance[self.current_distance].clone(),
min_samples: self.dbscan_search_parameters.min_samples[self.current_min_samples],
eps: self.dbscan_search_parameters.eps[self.current_eps],
algorithm: self.dbscan_search_parameters.algorithm[self.current_algorithm].clone(),
_phantom_t: PhantomData,
};
if self.current_distance + 1 < self.dbscan_search_parameters.distance.len() {
self.current_distance += 1;
} else if self.current_min_samples + 1 < self.dbscan_search_parameters.min_samples.len() {
self.current_distance = 0;
self.current_min_samples += 1;
} else if self.current_eps + 1 < self.dbscan_search_parameters.eps.len() {
self.current_distance = 0;
self.current_min_samples = 0;
self.current_eps += 1;
} else if self.current_algorithm + 1 < self.dbscan_search_parameters.algorithm.len() {
self.current_distance = 0;
self.current_min_samples = 0;
self.current_eps = 0;
self.current_algorithm += 1;
} else {
self.current_distance += 1;
self.current_min_samples += 1;
self.current_eps += 1;
self.current_algorithm += 1;
}
Some(next)
}
}
impl<T: Number> Default for DBSCANSearchParameters<T, Euclidian<T>> {
fn default() -> Self {
let default_params = DBSCANParameters::default();
DBSCANSearchParameters {
distance: vec![default_params.distance],
min_samples: vec![default_params.min_samples],
eps: vec![default_params.eps],
algorithm: vec![default_params.algorithm],
_phantom_t: PhantomData,
}
}
}
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> PartialEq
for DBSCAN<TX, TY, X, Y, D>
{
fn eq(&self, other: &Self) -> bool {
self.cluster_labels.len() == other.cluster_labels.len()
&& self.num_classes == other.num_classes
&& self.eps == other.eps
&& self.cluster_labels == other.cluster_labels
}
}
impl<T: Number> Default for DBSCANParameters<T, Euclidian<T>> {
fn default() -> Self {
DBSCANParameters {
distance: Distances::euclidian(),
min_samples: 5,
eps: 0.5f64,
algorithm: KNNAlgorithmName::default(),
_phantom_t: PhantomData,
}
}
}
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
UnsupervisedEstimator<X, DBSCANParameters<TX, D>> for DBSCAN<TX, TY, X, Y, D>
{
fn fit(x: &X, parameters: DBSCANParameters<TX, D>) -> Result<Self, Failed> {
DBSCAN::fit(x, parameters)
}
}
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> Predictor<X, Y>
for DBSCAN<TX, TY, X, Y, D>
{
fn predict(&self, x: &X) -> Result<Y, Failed> {
self.predict(x)
}
}
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
DBSCAN<TX, TY, X, Y, D>
{
pub fn fit(
x: &X,
parameters: DBSCANParameters<TX, D>,
) -> Result<DBSCAN<TX, TY, X, Y, D>, Failed> {
if parameters.min_samples < 1 {
return Err(Failed::fit("Invalid minPts"));
}
if parameters.eps <= 0f64 {
return Err(Failed::fit("Invalid radius: "));
}
let mut k = 0;
let queued = -2;
let outlier = -1;
let undefined = -3;
let n = x.shape().0;
let mut y = vec![undefined; n];
let algo = parameters.algorithm.fit(
x.row_iter()
.map(|row| row.iterator(0).cloned().collect())
.collect(),
parameters.distance,
)?;
let mut row = vec![TX::zero(); x.shape().1];
for (i, e) in x.row_iter().enumerate() {
if y[i] == undefined {
e.iterator(0).zip(row.iter_mut()).for_each(|(&x, r)| *r = x);
let mut neighbors = algo.find_radius(&row, parameters.eps)?;
if neighbors.len() < parameters.min_samples {
y[i] = outlier;
} else {
y[i] = k;
for j in 0..neighbors.len() {
if y[neighbors[j].0] == undefined {
y[neighbors[j].0] = queued;
}
}
while !neighbors.is_empty() {
let neighbor = neighbors.pop().unwrap();
let index = neighbor.0;
if y[index] == outlier {
y[index] = k;
}
if y[index] == undefined || y[index] == queued {
y[index] = k;
let secondary_neighbors =
algo.find_radius(neighbor.2, parameters.eps)?;
if secondary_neighbors.len() >= parameters.min_samples {
for j in 0..secondary_neighbors.len() {
let label = y[secondary_neighbors[j].0];
if label == undefined {
y[secondary_neighbors[j].0] = queued;
}
if label == undefined || label == outlier {
neighbors.push(secondary_neighbors[j]);
}
}
}
}
}
k += 1;
}
}
}
Ok(DBSCAN {
cluster_labels: y,
num_classes: k as usize,
knn_algorithm: algo,
eps: parameters.eps,
_phantom_ty: PhantomData,
_phantom_x: PhantomData,
_phantom_y: PhantomData,
})
}
pub fn predict(&self, x: &X) -> Result<Y, Failed> {
let (n, _) = x.shape();
let mut result = Y::zeros(n);
let mut row = vec![TX::zero(); x.shape().1];
for i in 0..n {
x.get_row(i)
.iterator(0)
.zip(row.iter_mut())
.for_each(|(&x, r)| *r = x);
let neighbors = self.knn_algorithm.find_radius(&row, self.eps)?;
let mut label = vec![0usize; self.num_classes + 1];
for neighbor in neighbors {
let yi = self.cluster_labels[neighbor.0];
if yi < 0 {
label[self.num_classes] += 1;
} else {
label[yi as usize] += 1;
}
}
let class = which_max(&label);
if class != self.num_classes {
result.set(i, TY::from(class + 1).unwrap());
} else {
result.set(i, TY::zero());
}
}
Ok(result)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::linalg::basic::matrix::DenseMatrix;
#[cfg(feature = "serde")]
use crate::metrics::distance::euclidian::Euclidian;
#[test]
fn search_parameters() {
let parameters: DBSCANSearchParameters<f64, Euclidian<f64>> = DBSCANSearchParameters {
min_samples: vec![10, 100],
eps: vec![1., 2.],
..Default::default()
};
let mut iter = parameters.into_iter();
let next = iter.next().unwrap();
assert_eq!(next.min_samples, 10);
assert_eq!(next.eps, 1.);
let next = iter.next().unwrap();
assert_eq!(next.min_samples, 100);
assert_eq!(next.eps, 1.);
let next = iter.next().unwrap();
assert_eq!(next.min_samples, 10);
assert_eq!(next.eps, 2.);
let next = iter.next().unwrap();
assert_eq!(next.min_samples, 100);
assert_eq!(next.eps, 2.);
assert!(iter.next().is_none());
}
#[cfg_attr(
all(target_arch = "wasm32", not(target_os = "wasi")),
wasm_bindgen_test::wasm_bindgen_test
)]
#[test]
fn fit_predict_dbscan() {
let x = DenseMatrix::from_2d_array(&[
&[1.0, 2.0],
&[1.1, 2.1],
&[0.9, 1.9],
&[1.2, 2.2],
&[0.8, 1.8],
&[2.0, 1.0],
&[2.1, 1.1],
&[1.9, 0.9],
&[2.2, 1.2],
&[1.8, 0.8],
&[3.0, 5.0],
]);
let expected_labels = vec![1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0];
let dbscan = DBSCAN::fit(
&x,
DBSCANParameters::default()
.with_eps(0.5)
.with_min_samples(2),
)
.unwrap();
let predicted_labels: Vec<i32> = dbscan.predict(&x).unwrap();
assert_eq!(expected_labels, predicted_labels);
}
#[cfg_attr(
all(target_arch = "wasm32", not(target_os = "wasi")),
wasm_bindgen_test::wasm_bindgen_test
)]
#[test]
#[cfg(feature = "serde")]
fn serde() {
let x = DenseMatrix::from_2d_array(&[
&[5.1, 3.5, 1.4, 0.2],
&[4.9, 3.0, 1.4, 0.2],
&[4.7, 3.2, 1.3, 0.2],
&[4.6, 3.1, 1.5, 0.2],
&[5.0, 3.6, 1.4, 0.2],
&[5.4, 3.9, 1.7, 0.4],
&[4.6, 3.4, 1.4, 0.3],
&[5.0, 3.4, 1.5, 0.2],
&[4.4, 2.9, 1.4, 0.2],
&[4.9, 3.1, 1.5, 0.1],
&[7.0, 3.2, 4.7, 1.4],
&[6.4, 3.2, 4.5, 1.5],
&[6.9, 3.1, 4.9, 1.5],
&[5.5, 2.3, 4.0, 1.3],
&[6.5, 2.8, 4.6, 1.5],
&[5.7, 2.8, 4.5, 1.3],
&[6.3, 3.3, 4.7, 1.6],
&[4.9, 2.4, 3.3, 1.0],
&[6.6, 2.9, 4.6, 1.3],
&[5.2, 2.7, 3.9, 1.4],
]);
let dbscan = DBSCAN::fit(&x, Default::default()).unwrap();
let deserialized_dbscan: DBSCAN<f32, f32, DenseMatrix<f32>, Vec<f32>, Euclidian<f32>> =
serde_json::from_str(&serde_json::to_string(&dbscan).unwrap()).unwrap();
assert_eq!(dbscan, deserialized_dbscan);
}
#[cfg(feature = "datasets")]
#[test]
fn from_vec() {
use crate::dataset::generator;
let blobs = generator::make_blobs(100, 2, 3);
let x: DenseMatrix<f32> = DenseMatrix::from_iterator(blobs.data.into_iter(), 100, 2, 0);
let labels: Vec<i32> = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0))
.and_then(|dbscan| dbscan.predict(&x))
.unwrap();
println!("{labels:?}");
}
}