use std::fmt::Debug;
use std::iter::Sum;
use serde::{Deserialize, Serialize};
use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
use crate::api::{Predictor, UnsupervisedEstimator};
use crate::error::Failed;
use crate::linalg::{row_iter, Matrix};
use crate::math::distance::euclidian::Euclidian;
use crate::math::distance::{Distance, Distances};
use crate::math::num::RealNumber;
use crate::tree::decision_tree_classifier::which_max;
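/// DBSCAN (Density-Based Spatial Clustering of Applications with Noise).
/// Samples whose `eps`-neighbourhood contains at least `min_samples` points
/// seed clusters, density-reachable points join them, and everything else is
/// labelled as noise.
///
/// A minimal usage sketch, mirroring the `fit_predict_dbscan` test at the
/// bottom of this file. The `smartcore::cluster::dbscan` import path is an
/// assumption about the crate layout, so the example is not compiled:
///
/// ```ignore
/// use smartcore::cluster::dbscan::{DBSCAN, DBSCANParameters};
/// use smartcore::linalg::naive::dense_matrix::DenseMatrix;
///
/// // Two dense groups of points plus one isolated outlier.
/// let x = DenseMatrix::from_2d_array(&[
///     &[1.0, 2.0],
///     &[1.1, 2.1],
///     &[0.9, 1.9],
///     &[2.0, 1.0],
///     &[2.1, 1.1],
///     &[1.9, 0.9],
///     &[3.0, 5.0],
/// ]);
///
/// let dbscan = DBSCAN::fit(
///     &x,
///     DBSCANParameters::default().with_eps(0.5).with_min_samples(2),
/// )
/// .unwrap();
///
/// // Cluster indices start at 0; noise points are labelled -1.0.
/// let labels = dbscan.predict(&x).unwrap();
/// ```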
#[derive(Serialize, Deserialize, Debug)]
pub struct DBSCAN<T: RealNumber, D: Distance<Vec<T>, T>> {
cluster_labels: Vec<i16>,
num_classes: usize,
knn_algorithm: KNNAlgorithm<T, D>,
eps: T,
}
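/// DBSCAN hyperparameters:
/// * `distance` - metric used to measure distance between samples.
/// * `min_samples` - number of samples in an `eps`-neighbourhood required for
///   a point to be considered a core point.
/// * `eps` - maximum distance between two samples for them to be considered
///   neighbours.
/// * `algorithm` - nearest-neighbour search structure used to find points
///   within `eps`.
///
/// Parameters are configured builder-style. A sketch with purely illustrative
/// values (not recommendations):
///
/// ```ignore
/// let params = DBSCANParameters::default()
///     .with_eps(0.5)
///     .with_min_samples(5)
///     .with_algorithm(KNNAlgorithmName::CoverTree);
/// ```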
#[derive(Debug, Clone)]
pub struct DBSCANParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
pub distance: D,
pub min_samples: usize,
pub eps: T,
pub algorithm: KNNAlgorithmName,
}
impl<T: RealNumber, D: Distance<Vec<T>, T>> DBSCANParameters<T, D> {
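    /// Change the distance metric, returning parameters specialized for the
    /// new metric type.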
pub fn with_distance<DD: Distance<Vec<T>, T>>(self, distance: DD) -> DBSCANParameters<T, DD> {
DBSCANParameters {
distance,
min_samples: self.min_samples,
eps: self.eps,
algorithm: self.algorithm,
}
}
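    /// Set the number of samples in an `eps`-neighbourhood required for a
    /// point to be considered a core point.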
pub fn with_min_samples(mut self, min_samples: usize) -> Self {
self.min_samples = min_samples;
self
}
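    /// Set the maximum distance between two samples for them to be considered
    /// neighbours.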
pub fn with_eps(mut self, eps: T) -> Self {
self.eps = eps;
self
}
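    /// Set the nearest-neighbour search algorithm used to find points within
    /// `eps`.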
pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self {
self.algorithm = algorithm;
self
}
}
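// Equality of fitted models is based on the cluster labels, the number of
// clusters and `eps`; the internal KNN search structure is not compared.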
impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for DBSCAN<T, D> {
fn eq(&self, other: &Self) -> bool {
self.cluster_labels.len() == other.cluster_labels.len()
&& self.num_classes == other.num_classes
&& self.eps == other.eps
&& self.cluster_labels == other.cluster_labels
}
}
impl<T: RealNumber> Default for DBSCANParameters<T, Euclidian> {
fn default() -> Self {
DBSCANParameters {
distance: Distances::euclidian(),
min_samples: 5,
eps: T::half(),
algorithm: KNNAlgorithmName::CoverTree,
}
}
}
impl<T: RealNumber + Sum, M: Matrix<T>, D: Distance<Vec<T>, T>>
UnsupervisedEstimator<M, DBSCANParameters<T, D>> for DBSCAN<T, D>
{
fn fit(x: &M, parameters: DBSCANParameters<T, D>) -> Result<Self, Failed> {
DBSCAN::fit(x, parameters)
}
}
impl<T: RealNumber, M: Matrix<T>, D: Distance<Vec<T>, T>> Predictor<M, M::RowVector>
for DBSCAN<T, D>
{
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
self.predict(x)
}
}
impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
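    /// Fit DBSCAN to `x`.
    /// * `x` - input matrix with one sample per row.
    /// * `parameters` - hyperparameters, see [`DBSCANParameters`].
    ///
    /// Returns the fitted model or an error when `min_samples` or `eps` is
    /// invalid.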
pub fn fit<M: Matrix<T>>(
x: &M,
parameters: DBSCANParameters<T, D>,
) -> Result<DBSCAN<T, D>, Failed> {
        if parameters.min_samples < 1 {
            return Err(Failed::fit(&"Invalid min_samples, must be >= 1".to_string()));
        }
        if parameters.eps <= T::zero() {
            return Err(Failed::fit(&"Invalid eps, must be greater than 0".to_string()));
        }
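        // `k` is the index of the cluster currently being grown. Labels start
        // as `undefined`; `queued` marks points scheduled for expansion, and
        // `outlier` marks points that (so far) have too few neighbours.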
let mut k = 0;
let queued = -2;
let outlier = -1;
let undefined = -3;
let n = x.shape().0;
let mut y = vec![undefined; n];
let algo = parameters
.algorithm
.fit(row_iter(x).collect(), parameters.distance)?;
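        // Visit every still-undefined sample: if its `eps`-neighbourhood holds
        // at least `min_samples` points it seeds cluster `k`, otherwise it is
        // provisionally marked as an outlier.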
for (i, e) in row_iter(x).enumerate() {
if y[i] == undefined {
let mut neighbors = algo.find_radius(&e, parameters.eps)?;
if neighbors.len() < parameters.min_samples {
y[i] = outlier;
} else {
y[i] = k;
                    for neighbor in neighbors.iter() {
                        if y[neighbor.0] == undefined {
                            y[neighbor.0] = queued;
                        }
                    }
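                    // Grow cluster `k` by expanding queued neighbours that are
                    // themselves core points; outliers reachable from a core
                    // point become border points of the cluster.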
                    while let Some(neighbor) = neighbors.pop() {
                        let index = neighbor.0;
if y[index] == outlier {
y[index] = k;
}
if y[index] == undefined || y[index] == queued {
y[index] = k;
let secondary_neighbors =
algo.find_radius(neighbor.2, parameters.eps)?;
if secondary_neighbors.len() >= parameters.min_samples {
                                for secondary_neighbor in secondary_neighbors {
                                    let label = y[secondary_neighbor.0];
                                    if label == undefined {
                                        y[secondary_neighbor.0] = queued;
                                    }
                                    if label == undefined || label == outlier {
                                        neighbors.push(secondary_neighbor);
                                    }
                                }
}
}
}
k += 1;
}
}
}
Ok(DBSCAN {
cluster_labels: y,
num_classes: k as usize,
knn_algorithm: algo,
eps: parameters.eps,
})
}
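    /// Predict cluster labels for each row of `x`.
    /// * `x` - input matrix with one sample per row.
    ///
    /// Each sample is assigned the label that occurs most often among the
    /// fitted samples within `eps` of it; if noise points dominate that
    /// neighbourhood the sample is labelled `-1`.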
pub fn predict<M: Matrix<T>>(&self, x: &M) -> Result<M::RowVector, Failed> {
let (n, m) = x.shape();
let mut result = M::zeros(1, n);
let mut row = vec![T::zero(); m];
for i in 0..n {
x.copy_row_as_vec(i, &mut row);
let neighbors = self.knn_algorithm.find_radius(&row, self.eps)?;
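            // Vote among the fitted samples within `eps`: one slot per cluster
            // plus a final slot (index `num_classes`) that counts noise.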
let mut label = vec![0usize; self.num_classes + 1];
for neighbor in neighbors {
let yi = self.cluster_labels[neighbor.0];
if yi < 0 {
label[self.num_classes] += 1;
} else {
label[yi as usize] += 1;
}
}
let class = which_max(&label);
if class != self.num_classes {
result.set(0, i, T::from(class).unwrap());
} else {
result.set(0, i, -T::one());
}
}
Ok(result.to_row_vector())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::linalg::naive::dense_matrix::DenseMatrix;
use crate::math::distance::euclidian::Euclidian;
#[test]
fn fit_predict_dbscan() {
let x = DenseMatrix::from_2d_array(&[
&[1.0, 2.0],
&[1.1, 2.1],
&[0.9, 1.9],
&[1.2, 2.2],
&[0.8, 1.8],
&[2.0, 1.0],
&[2.1, 1.1],
&[1.9, 0.9],
&[2.2, 1.2],
&[1.8, 0.8],
&[3.0, 5.0],
]);
let expected_labels = vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0];
let dbscan = DBSCAN::fit(
&x,
DBSCANParameters::default()
.with_eps(0.5)
.with_min_samples(2),
)
.unwrap();
let predicted_labels = dbscan.predict(&x).unwrap();
assert_eq!(expected_labels, predicted_labels);
}
#[test]
fn serde() {
let x = DenseMatrix::from_2d_array(&[
&[5.1, 3.5, 1.4, 0.2],
&[4.9, 3.0, 1.4, 0.2],
&[4.7, 3.2, 1.3, 0.2],
&[4.6, 3.1, 1.5, 0.2],
&[5.0, 3.6, 1.4, 0.2],
&[5.4, 3.9, 1.7, 0.4],
&[4.6, 3.4, 1.4, 0.3],
&[5.0, 3.4, 1.5, 0.2],
&[4.4, 2.9, 1.4, 0.2],
&[4.9, 3.1, 1.5, 0.1],
&[7.0, 3.2, 4.7, 1.4],
&[6.4, 3.2, 4.5, 1.5],
&[6.9, 3.1, 4.9, 1.5],
&[5.5, 2.3, 4.0, 1.3],
&[6.5, 2.8, 4.6, 1.5],
&[5.7, 2.8, 4.5, 1.3],
&[6.3, 3.3, 4.7, 1.6],
&[4.9, 2.4, 3.3, 1.0],
&[6.6, 2.9, 4.6, 1.3],
&[5.2, 2.7, 3.9, 1.4],
]);
let dbscan = DBSCAN::fit(&x, Default::default()).unwrap();
let deserialized_dbscan: DBSCAN<f64, Euclidian> =
serde_json::from_str(&serde_json::to_string(&dbscan).unwrap()).unwrap();
assert_eq!(dbscan, deserialized_dbscan);
}
}