1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
use crate::appx_dbscan::clustering::AppxDbscanLabeler;
use crate::appx_dbscan::{AppxDbscanParams, AppxDbscanValidParams};
use linfa::{traits::Transformer, DatasetBase, Float};
use ndarray::{Array1, ArrayBase, Data, Ix2};
#[cfg(feature = "serde")]
use serde_crate::{Deserialize, Serialize};
#[cfg_attr(
feature = "serde",
derive(Serialize, Deserialize),
serde(crate = "serde_crate")
)]
#[derive(Clone, Debug, PartialEq)]
/// DBSCAN (Density-based Spatial Clustering of Applications with Noise)
/// clusters together neighbouring points, while points in sparse regions are labelled
/// as noise. Since points may be part of a cluster or noise the transform method returns
/// `Array1<Option<usize>>`. It should be noted that some "border" points may technically
/// belong to more than one cluster but, since the transform function returns only one
/// label per point (if any), then only one cluster is chosen arbitrarily for those points.
///
/// As DBSCAN groups together points in dense regions, the number of clusters is
/// determined by the dataset and distance tolerance not the user.
///
/// This is an implementation of the approximated version of DBSCAN with
/// complexity O(N). Additional information can be found in
/// [this paper](https://www.cse.cuhk.edu.hk/~taoyf/paper/tods17-dbscan.pdf).
/// Beware of the hidden constant `O((1/slack)^dimensionality)` of the complexity
/// for very small values of `slack` and high dimensionalities.
/// The approximated version of the DBSCAN converges to the exact DBSCAN result
/// for a `slack` that goes to zero. Since only the `tranform` method is provided and
/// border points are not assigned deterministically, it may happen that the two
/// results still differ (in terms of border points) for very small values
/// of `slack`.
///
/// ## The algorithm
///
/// Let `d` be the the number of features of each point:
///
/// * The d-dimensional space is divided in a grid of cells, each containing the points of the dataset that fall
/// inside of them.
/// * All points that have at least `min_points` points in their neighbourhood of size `tolerance` are labeled as "core points". Every
/// cell containing at least one "core point" is labeled as a "core-cell".
/// * All "core cells" are set as vertexes of a graph and an arc is built between any two "core cells" according to the following rules:
/// * If there are two points in the two cells that have distance between them at most `tolerance` then the arc is added to the graph;
/// * If there are two points in the two cells that have distance between them that is between `tolerance` and `tolerance * (1 + slack)` then the arc is added arbitrarily to the graph;
/// * Otherwise the arc is not added;
/// * Every connected component of the graph is a cluster and every core point in the CC is given the label of that cluster.
/// * For all the non-core points in the cells we search for any neighbouring core point (with the same arbirtary rule used to create the arcs)
/// belonging to a cluster and, if at least one is found, then the non-core point is given the label of the cluster of the core-point. If no such core
/// point is found then the point is a "noise" point and it is given a label of `None`.
///
/// ## Tutorial
///
/// Let's do a walkthrough of an example running the approximated DBSCAN on some data.
///
/// ```rust
/// use linfa_clustering::{AppxDbscan, generate_blobs};
/// use linfa::traits::Transformer;
/// use ndarray::{Axis, array, s};
/// use ndarray_rand::rand::SeedableRng;
/// use rand_isaac::Isaac64Rng;
/// use approx::assert_abs_diff_eq;
///
/// // Our random number generator, seeded for reproducibility
/// let seed = 42;
/// let mut rng = Isaac64Rng::seed_from_u64(seed);
///
/// // `expected_centroids` has shape `(n_centroids, n_features)`
/// // i.e. three points in the 2-dimensional plane
/// let expected_centroids = array![[0., 1.], [-10., 20.], [-1., 10.]];
/// // Let's generate a synthetic dataset: three blobs of observations
/// // (100 points each) centered around our `expected_centroids`
/// let observations = generate_blobs(100, &expected_centroids, &mut rng);
///
/// // Let's configure and run our AppxDbscan algorithm
/// // We use the builder pattern to specify the hyperparameters
/// // `min_points` is the only mandatory parameter.
/// // If you don't specify the others (e.g. `tolerance`, `slack`)
/// // default values will be used.
/// let min_points = 3;
/// // Let's run the algorithm!
/// let params = AppxDbscan::params(min_points).tolerance(1e-2).slack(1e-3).transform(&observations).unwrap();
/// // Points are `None` if noise `Some(id)` if belonging to a cluster.
/// ```
///
pub struct AppxDbscan;
impl AppxDbscan {
/// Configures the hyperparameters with the minimum number of points required to form a cluster
///
/// Defaults are provided if the optional parameters are not specified:
/// * `tolerance = 1e-4`
/// * `slack = 1e-2`
pub fn params<F: Float>(min_points: usize) -> AppxDbscanParams<F> {
AppxDbscanParams::new(min_points)
}
}
impl<F: Float, D: Data<Elem = F>> Transformer<&ArrayBase<D, Ix2>, Array1<Option<usize>>>
for AppxDbscanValidParams<F>
{
fn transform(&self, observations: &ArrayBase<D, Ix2>) -> Array1<Option<usize>> {
if observations.dim().0 == 0 {
return Array1::from_elem(0, None);
}
let labeler = AppxDbscanLabeler::new(&observations.view(), self);
labeler.into_labels()
}
}
impl<F: Float, D: Data<Elem = F>, T>
Transformer<
DatasetBase<ArrayBase<D, Ix2>, T>,
DatasetBase<ArrayBase<D, Ix2>, Array1<Option<usize>>>,
> for AppxDbscanValidParams<F>
{
fn transform(
&self,
dataset: DatasetBase<ArrayBase<D, Ix2>, T>,
) -> DatasetBase<ArrayBase<D, Ix2>, Array1<Option<usize>>> {
let predicted = self.transform(dataset.records());
dataset.with_targets(predicted)
}
}