kmedoids/
pam.rs

1use crate::arrayadapter::ArrayAdapter;
2use crate::fasterpam::{do_swap, initial_assignment};
3use crate::util::*;
4use core::ops::AddAssign;
5use num_traits::{Signed, Zero};
6use std::convert::From;
7
8/// Run the original PAM SWAP algorithm (no BUILD, but given initial medoids).
9///
10/// This is provided for academic reasons to see the performance difference.
11/// Quality-wise, FasterPAM is not worse on average, but much faster.
12/// FastPAM1 is supposed to do the same swaps, and find the same result, but faster.
13///
14/// * type `M` - matrix data type such as `ndarray::Array2` or `kmedoids::arrayadapter::LowerTriangle`
15/// * type `N` - number data type such as `u32` or `f64`
16/// * type `L` - number data type such as `i64` or `f64` for the loss (must be signed)
17/// * `mat` - a pairwise distance matrix
18/// * `med` - the list of medoids
19/// * `maxiter` - the maximum number of iterations allowed
20///
21/// returns a tuple containing:
22/// * the final loss
23/// * the final cluster assignment
24/// * the number of iterations needed
25/// * the number of swaps performed
26///
27/// ## Panics
28///
29/// * panics when the dissimilarity matrix is not square
30/// * panics when k is 0 or larger than N
31///
32/// ## Example
33/// Given a dissimilarity matrix of size 4 x 4, use:
34/// ```
35/// let data = ndarray::arr2(&[[0,1,2,3],[1,0,4,5],[2,4,0,6],[3,5,6,0]]);
36/// let mut meds = kmedoids::random_initialization(4, 2, &mut rand::thread_rng());
37/// let (loss, assi, n_iter, n_swap): (f64, _, _, _) = kmedoids::pam_swap(&data, &mut meds, 100);
38/// println!("Loss is: {}", loss);
39/// ```
40pub fn pam_swap<M, N, L>(
41	mat: &M,
42	med: &mut [usize],
43	maxiter: usize,
44) -> (L, Vec<usize>, usize, usize)
45where
46	N: Zero + PartialOrd + Clone,
47	L: AddAssign + Signed + Zero + PartialOrd + Clone + From<N>,
48	M: ArrayAdapter<N>,
49{
50	let (loss, mut data) = initial_assignment(mat, med);
51	pam_optimize(mat, med, &mut data, maxiter, loss)
52}
53
54/// Run the original PAM BUILD algorithm.
55///
56/// This is provided for academic reasons to see the performance difference.
57/// Quality-wise, FasterPAM yields better results than just BUILD.
58///
59/// * type `M` - matrix data type such as `ndarray::Array2` or `kmedoids::arrayadapter::LowerTriangle`
60/// * type `N` - number data type such as `u32` or `f64`
61/// * type `L` - number data type such as `i64` or `f64` for the loss (must be signed)
62/// * `mat` - a pairwise distance matrix
63/// * `k` - the number of medoids to pick
64///
65/// returns a tuple containing:
66/// * the initial loss
67/// * the initial cluster assignment
68/// * the initial medoids
69///
70/// ## Panics
71///
72/// * panics when the dissimilarity matrix is not square
73/// * panics when k is 0 or larger than N
74///
75/// ## Example
76/// Given a dissimilarity matrix of size 4 x 4, use:
77/// ```
78/// let data = ndarray::arr2(&[[0,1,2,3],[1,0,4,5],[2,4,0,6],[3,5,6,0]]);
79/// let (loss, assi, meds): (f64, _, _) = kmedoids::pam_build(&data, 2);
80/// println!("Loss is: {}", loss);
81/// ```
82pub fn pam_build<M, N, L>(mat: &M, k: usize) -> (L, Vec<usize>, Vec<usize>)
83where
84	N: Zero + PartialOrd + Clone,
85	L: AddAssign + Signed + Zero + PartialOrd + Clone + From<N>,
86	M: ArrayAdapter<N>,
87{
88	let n = mat.len();
89	assert!(mat.is_square(), "Dissimilarity matrix is not square");
90	assert!(n <= u32::MAX as usize, "N is too large");
91	assert!(k > 0 && k < u32::MAX as usize, "invalid N");
92	assert!(k <= n, "k must be at most N");
93	let mut meds = Vec::<usize>::with_capacity(k);
94	let mut data = Vec::<Rec<N>>::with_capacity(n);
95	let loss = pam_build_initialize(mat, &mut meds, &mut data, k);
96	let assi = data.iter().map(|x| x.near.i as usize).collect();
97	(loss, assi, meds)
98}
99
100/// Run the original PAM algorithm (BUILD and SWAP).
101///
102/// This is provided for academic reasons to see the performance difference.
103/// Quality-wise, FasterPAM is comparable to PAM, and much faster.
104///
105/// * type `M` - matrix data type such as `ndarray::Array2` or `kmedoids::arrayadapter::LowerTriangle`
106/// * type `N` - number data type such as `u32` or `f64`
107/// * type `L` - number data type such as `i64` or `f64` for the loss (must be signed)
108/// * `mat` - a pairwise distance matrix
109/// * `k` - the number of medoids to pick
110/// * `maxiter` - the maximum number of iterations allowed
111///
112/// returns a tuple containing:
113/// * the final loss
114/// * the final cluster assignment
115/// * the final medoids
116/// * the number of iterations needed
117/// * the number of swaps performed
118///
119/// ## Panics
120///
121/// * panics when the dissimilarity matrix is not square
122/// * panics when k is 0 or larger than N
123///
124/// ## Example
125/// Given a dissimilarity matrix of size 4 x 4, use:
126/// ```
127/// let data = ndarray::arr2(&[[0,1,2,3],[1,0,4,5],[2,4,0,6],[3,5,6,0]]);
128/// let (loss, assi, meds, n_iter, n_swap): (f64, _, _, _, _) = kmedoids::pam(&data, 2, 100);
129/// println!("Loss is: {}", loss);
130/// ```
131pub fn pam<M, N, L>(mat: &M, k: usize, maxiter: usize) -> (L, Vec<usize>, Vec<usize>, usize, usize)
132where
133	N: Zero + PartialOrd + Clone,
134	L: AddAssign + Signed + Zero + PartialOrd + Clone + From<N>,
135	M: ArrayAdapter<N>,
136{
137	let n = mat.len();
138	assert!(mat.is_square(), "Dissimilarity matrix is not square");
139	assert!(n <= u32::MAX as usize, "N is too large");
140	assert!(k > 0 && k < u32::MAX as usize, "invalid N");
141	assert!(k <= n, "k must be at most N");
142	let mut meds = Vec::<usize>::with_capacity(k);
143	let mut data = Vec::<Rec<N>>::with_capacity(n);
144	let loss = pam_build_initialize(mat, &mut meds, &mut data, k);
145	let (nloss, assi, n_iter, n_swap) = pam_optimize(mat, &mut meds, &mut data, maxiter, loss);
146	(nloss, assi, meds, n_iter, n_swap) // also return medoids
147}
148
149/// Main optimization function of PAM, not exposed (use pam_swap or pam)
150fn pam_optimize<M, N, L>(
151	mat: &M,
152	med: &mut [usize],
153	data: &mut [Rec<N>],
154	maxiter: usize,
155	mut loss: L,
156) -> (L, Vec<usize>, usize, usize)
157where
158	N: Zero + PartialOrd + Clone,
159	L: AddAssign + Signed + Zero + PartialOrd + Clone + From<N>,
160	M: ArrayAdapter<N>,
161{
162	let (n, k) = (mat.len(), med.len());
163	if k == 1 {
164		let assi = vec![0; n];
165		let (swapped, loss) = choose_medoid_within_partition::<M, N, L>(mat, &assi, med, 0);
166		return (loss, assi, 1, if swapped { 1 } else { 0 });
167	}
168	debug_assert_assignment(mat, med, data);
169	let (mut n_swaps, mut iter) = (0, 0);
170	while iter < maxiter {
171		iter += 1;
172		let mut best = (L::zero(), k, usize::MAX);
173		for j in 0..n {
174			if j == med[data[j].near.i as usize] {
175				continue; // This already is a medoid
176			}
177			let (change, b) = find_best_swap_pam(mat, med, data, j);
178			if change >= best.0 {
179				continue; // No improvement
180			}
181			best = (change, b, j);
182		}
183		if best.0 < L::zero() {
184			n_swaps += 1;
185			// perform the swap
186			let newloss = do_swap(mat, med, data, best.1, best.2);
187			if newloss >= loss {
188				break; // Probably numerically unstable now.
189			}
190			loss = newloss;
191		} else {
192			break; // No improvement, or NaN.
193		}
194	}
195	let assi = data.iter().map(|x| x.near.i as usize).collect();
196	(loss, assi, iter, n_swaps)
197}
198
199/// Find the best swap for object j - slower PAM version
200#[inline]
201fn find_best_swap_pam<M, N, L>(mat: &M, med: &[usize], data: &[Rec<N>], j: usize) -> (L, usize)
202where
203	N: Zero + PartialOrd + Clone,
204	L: AddAssign + Signed + Zero + PartialOrd + Clone + From<N>,
205	M: ArrayAdapter<N>,
206{
207	let recj = &data[j];
208	let mut best = (L::zero(), usize::MAX);
209	for (m, _) in med.iter().enumerate() {
210		let mut acc: L = -L::from(recj.near.d.clone()); // j becomes medoid
211		for (o, reco) in data.iter().enumerate() {
212			if o == j {
213				continue;
214			}
215			let doj = mat.get(o, j);
216			// Current medoid is being replaced:
217			if reco.near.i as usize == m {
218				if doj < reco.seco.d {
219					// Assign to new medoid:
220					acc += L::from(doj) - L::from(reco.near.d.clone())
221				} else {
222					// Assign to second nearest instead:
223					acc += L::from(reco.seco.d.clone()) - L::from(reco.near.d.clone())
224				}
225			} else if doj < reco.near.d {
226				// new mediod is closer:
227				acc += L::from(doj) - L::from(reco.near.d.clone())
228			} // else no change
229		}
230		if acc < best.0 {
231			best = (acc, m);
232		}
233	}
234	best
235}
236
237/// Not exposed. Use pam_build or pam.
238pub(crate) fn pam_build_initialize<M, N, L>(
239	mat: &M,
240	meds: &mut Vec<usize>,
241	data: &mut Vec<Rec<N>>,
242	k: usize,
243) -> L
244where
245	N: Zero + PartialOrd + Clone,
246	L: AddAssign + Signed + Zero + PartialOrd + Clone + From<N>,
247	M: ArrayAdapter<N>,
248{
249	let n = mat.len();
250	assert!(mat.is_square(), "Dissimilarity matrix is not square");
251	// choose first medoid
252	let mut best = (L::zero(), k);
253	for i in 0..n {
254		let mut sum = L::zero();
255		for j in 0..n {
256			if j != i {
257				sum += L::from(mat.get(j, i));
258			}
259		}
260		if i == 0 || sum < best.0 {
261			best = (sum, i);
262		}
263	}
264	let mut loss = best.0;
265	meds.push(best.1);
266	for j in 0..n {
267		data.push(Rec::new(0, mat.get(j, best.1), u32::MAX, N::zero()));
268	}
269	// choose remaining medoids
270	for l in 1..k {
271		best = (L::zero(), k);
272		for (i, _) in data.iter().enumerate() {
273			let mut sum = -L::from(data[i].near.d.clone());
274			for (j, dj) in data.iter().enumerate() {
275				if j != i {
276					let d = mat.get(j, i);
277					if d < dj.near.d {
278						sum += L::from(d) - L::from(dj.near.d.clone())
279					}
280				}
281			}
282			if i == 0 || sum < best.0 {
283				best = (sum, i);
284			}
285		}
286		if best.0 >= L::zero() { break; } // No further improvements - duplicates etc.
287		// Update assignments:
288		loss = L::zero();
289		for (j, recj) in data.iter_mut().enumerate() {
290			if j == best.1 {
291				recj.seco = recj.near.clone();
292				recj.near = DistancePair::new(l as u32, N::zero());
293				continue;
294			}
295			let dj = mat.get(j, best.1);
296			if dj < recj.near.d {
297				recj.seco = recj.near.clone();
298				recj.near = DistancePair::new(l as u32, dj);
299			} else if recj.seco.i == u32::MAX || dj < recj.seco.d {
300				recj.seco = DistancePair::new(l as u32, dj);
301			}
302			loss += L::from(recj.near.d.clone());
303		}
304		meds.push(best.1);
305	}
306	loss
307}
308
309#[cfg(test)]
310mod tests {
311	// TODO: use a larger, much more interesting example.
312	use crate::{
313		arrayadapter::LowerTriangle, pam, pam_build, pam_swap, silhouette, util::assert_array,
314	};
315
316	#[test]
317	fn test_pam_swap_simple() {
318		let data = LowerTriangle {
319			n: 5,
320			data: vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 1],
321		};
322		let mut meds = vec![0, 1];
323		let (loss, assi, n_iter, n_swap): (i64, _, _, _) = pam_swap(&data, &mut meds, 10);
324		let (sil, _): (f64, _) = silhouette(&data, &assi, false);
325		assert_eq!(loss, 4, "loss not as expected");
326		assert_eq!(n_swap, 1, "swaps not as expected");
327		assert_eq!(n_iter, 2, "iterations not as expected");
328		assert_array(assi, vec![0, 0, 0, 1, 1], "assignment not as expected");
329		assert_array(meds, vec![0, 3], "medoids not as expected");
330		assert_eq!(sil, 0.7522494172494172, "Silhouette not as expected");
331	}
332
333	#[test]
334	fn test_pam_build_simple() {
335		let data = LowerTriangle {
336			n: 5,
337			data: vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 1],
338		};
339		let (loss, assi, meds): (i64, _, _) = pam_build(&data, 2);
340		let (sil, _): (f64, _) = silhouette(&data, &assi, false);
341		assert_eq!(loss, 4, "loss not as expected");
342		assert_array(assi, vec![0, 0, 0, 1, 1], "assignment not as expected");
343		assert_array(meds, vec![0, 3], "medoids not as expected");
344		assert_eq!(sil, 0.7522494172494172, "Silhouette not as expected");
345	}
346
347	#[test]
348	fn test_pam_simple() {
349		let data = LowerTriangle {
350			n: 5,
351			data: vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 1],
352		};
353		let (loss, assi, meds, n_iter, n_swap): (i64, _, _, _, _) = pam(&data, 2, 10);
354		let (sil, _): (f64, _) = silhouette(&data, &assi, false);
355		// no swaps, because BUILD does a decent job
356		assert_eq!(n_swap, 0, "swaps not as expected");
357		assert_eq!(n_iter, 1, "iterations not as expected");
358		assert_eq!(loss, 4, "loss not as expected");
359		assert_array(assi, vec![0, 0, 0, 1, 1], "assignment not as expected");
360		assert_array(meds, vec![0, 3], "medoids not as expected");
361		assert_eq!(sil, 0.7522494172494172, "Silhouette not as expected");
362	}
363}