kmedoids/silhouette.rs
1use crate::arrayadapter::ArrayAdapter;
2use core::ops::{AddAssign, Div, Sub};
3use num_traits::{Signed, Zero};
4use std::convert::From;
5
6/// Compute the Silhouette of a strict partitional clustering.
7///
8/// The Silhouette, proposed by Peter Rousseeuw in 1987, is a popular internal
9/// evaluation measure for clusterings. Although it is defined on arbitary metrics,
10/// it is most appropriate for evaluating "spherical" clusters, as it expects objects
11/// to be closer to all members of its own cluster than to members of other clusters.
12///
13/// Because of the additional requirement of a division operator, this implementation
14/// currently always returns a float result, and accepts only input distances that can be
15/// converted into floats.
16///
17/// * type `M` - matrix data type such as `ndarray::Array2` or `kmedoids::arrayadapter::LowerTriangle`
18/// * type `N` - number data type such as `u32` or `f64`
19/// * type `L` - number data type such as `f64` for the cost (use a float type)
20/// * `mat` - a pairwise distance matrix
21/// * `assi` - the cluster assignment
22/// * `samples` - whether to keep the individual samples, or not
23///
24/// returns a tuple containing:
25/// * the average silhouette
26/// * the individual silhouette values (empty if `samples = false`)
27///
28/// ## Panics
29///
30/// * panics when the dissimilarity matrix is not square
31///
32/// ## Example
33/// Given a dissimilarity matrix of size 4 x 4, use:
34/// ```
35/// let data = ndarray::arr2(&[[0,1,2,3],[1,0,4,5],[2,4,0,6],[3,5,6,0]]);
36/// let mut meds = kmedoids::random_initialization(4, 2, &mut rand::thread_rng());
37/// let (loss, assi, n_iter): (f64, _, _) = kmedoids::alternating(&data, &mut meds, 100);
38/// let (sil, _): (f64, _) = kmedoids::silhouette(&data, &assi, false);
39/// println!("Silhouette is: {}", sil);
40/// ```
41pub fn silhouette<M, N, L>(mat: &M, assi: &[usize], samples: bool) -> (L, Vec<L>)
42where
43 N: Zero + PartialOrd + Clone,
44 L: AddAssign
45 + Div<Output = L>
46 + Sub<Output = L>
47 + Signed
48 + Zero
49 + PartialOrd
50 + Clone
51 + From<N>
52 + From<u32>,
53 M: ArrayAdapter<N>,
54{
55 assert!(mat.is_square(), "Dissimilarity matrix is not square");
56 let mut sil = vec![L::zero(); if samples { assi.len() } else { 0}];
57 let mut lsum: L = L::zero();
58 let mut buf = Vec::<(u32, L)>::new();
59 for (i, &ai) in assi.iter().enumerate() {
60 buf.clear();
61 for (j, &aj) in assi.iter().enumerate() {
62 while aj >= buf.len() {
63 buf.push((0, L::zero()));
64 }
65 if i != j {
66 buf[aj].0 += 1;
67 buf[aj].1 += mat.get(i, j).into();
68 }
69 }
70 if buf.len() == 1 {
71 return (L::zero(), sil);
72 }
73 let s = if buf[ai].0 > 0 {
74 let a = checked_div(buf[ai].1.clone(), buf[ai].0.into());
75 let mut tmp = buf
76 .iter()
77 .enumerate()
78 .filter(|&(i, _)| i != ai)
79 .map(|(_, p)| checked_div(p.1.clone(), p.0.into()));
80 // Ugly hack to get the min():
81 let tmp2 = tmp.next().unwrap_or_else(L::zero);
82 let b = tmp.fold(tmp2, |x, y| if y < x { y } else { x });
83 checked_div(b.clone() - a.clone(), if a > b { a } else { b })
84 } else {
85 L::zero() // singleton
86 };
87 if samples {
88 sil[i] = s.clone();
89 }
90 lsum += s.clone();
91 }
92 if samples {
93 assert_eq!(sil.len(), assi.len(), "Length not as expected.");
94 }
95 (lsum.div((assi.len() as u32).into()), sil)
96}
97
98/// Compute the Medoid Silhouette of a clustering.
99///
100/// The Medoid Silhouette is an approximation to the original Silhouette where the
101/// distance to the cluster medoid is used instead of the average distance, hence reducing
102/// the run time from O(N²) to O(Nk). Here we assume that every object is assigned the
103/// nearest cluster, and hence only a distance matrix and a list of medoids is given.
104///
105/// Because of the additional requirement of a division operator, this implementation
106/// currently always returns a float result, and accepts only input distances that can be
107/// converted into floats.
108///
109/// TODO: allow using N x k distance matrixes, too.
110///
111/// * type `M` - matrix data type such as `ndarray::Array2` or `kmedoids::arrayadapter::LowerTriangle`
112/// * type `N` - number data type such as `u32` or `f64`
113/// * type `L` - number data type such as `f64` for the cost (use a float type)
114/// * `mat` - a pairwise distance matrix
115/// * `meds` - the medoid list
116/// * `samples` - whether to keep the individual samples, or not
117///
118/// returns a tuple containing:
119/// * the average medoid silhouette
120/// * the individual medoid silhouette values (empty if `samples = false`)
121///
122/// ## Panics
123///
124/// * panics when the dissimilarity matrix is not square
125///
126/// ## Example
127/// Given a dissimilarity matrix of size 4 x 4, use:
128/// ```
129/// let data = ndarray::arr2(&[[0,1,2,3],[1,0,4,5],[2,4,0,6],[3,5,6,0]]);
130/// let mut meds = kmedoids::random_initialization(4, 2, &mut rand::thread_rng());
131/// let (loss, assi, n_iter): (f64, _, _) = kmedoids::alternating(&data, &mut meds, 100);
132/// let (sil, _): (f64, _) = kmedoids::medoid_silhouette(&data, &meds, false);
133/// println!("Silhouette is: {}", sil);
134/// ```
135pub fn medoid_silhouette<M, N, L>(mat: &M, meds: &[usize], samples: bool) -> (L, Vec<L>)
136where
137 N: Zero + PartialOrd + Clone,
138 L: AddAssign
139 + Div<Output = L>
140 + Sub<Output = L>
141 + Signed
142 + Zero
143 + PartialOrd
144 + Clone
145 + From<N>
146 + From<u32>,
147 M: ArrayAdapter<N>,
148{
149 let (n, k) = (mat.len(), meds.len());
150 assert!(mat.is_square(), "Dissimilarity matrix is not square");
151 assert!(n <= u32::MAX as usize, "N is too large");
152 let mut sil = vec![L::one(); if samples { n } else { 0 }];
153 if k == 1 { return (L::one(), sil); } // not really well-defined
154 assert!(k <= n, "invalid k, must be over 1 and at most N");
155 let mut loss = L::zero();
156 #[allow(clippy::needless_range_loop)]
157 for i in 0..n {
158 let (d1, d2) = (mat.get(i, meds[0]), mat.get(i, meds[1]));
159 let mut best = if d1 < d2 { (d1, d2) } else { (d2, d1) };
160 for &m in meds.iter().skip(2) {
161 let d = mat.get(i, m);
162 if d < best.0 {
163 best = (d, best.0);
164 }
165 else if d < best.1 {
166 best = (best.0, d);
167 }
168 }
169 if !N::is_zero(&best.0) {
170 let s = <L as From<N>>::from(best.0.clone()) / <L as From<N>>::from(best.1.clone());
171 if samples { sil[i] = L::one() - s.clone(); }
172 loss += s;
173 }
174 }
175 loss = L::one() - loss / <L as From<u32>>::from(n as u32);
176 (loss, sil)
177}
178
179// helper function, returns 0 on division by 0
180pub(crate) fn checked_div<L>(x: L, y: L) -> L
181where
182 L: Div<Output = L> + Zero + Clone + PartialOrd,
183{
184 if y > L::zero() {
185 x.div(y)
186 } else {
187 L::zero()
188 }
189}