kodama/
union.rs

1use std::usize;
2
3use crate::dendrogram::Dendrogram;
4use crate::Method;
5
6/// A specialized implementation of union-find for linkage.
7///
8/// This union-find implementation represents a set of cluster labels. It
9/// supports fast lookups and fast unions.
10///
11/// This specific data structure is design to support cluster labels for a
12/// fixed set of observations. Namely, if there are `N` observations, then
13/// there are `N + N - 1` possible cluster labels.
14#[derive(Clone, Debug)]
15pub struct LinkageUnionFind {
16    /// A map from cluster label to its cluster's parent.
17    ///
18    /// When a cluster label is mapped to itself, then it is considered a
19    /// root.
20    parents: Vec<usize>,
21    /// The next cluster label to assign on the next union.
22    next_parent: usize,
23}
24
25impl Default for LinkageUnionFind {
26    fn default() -> LinkageUnionFind {
27        LinkageUnionFind::new()
28    }
29}
30
31impl LinkageUnionFind {
32    /// Create a new empty set.
33    pub fn new() -> LinkageUnionFind {
34        LinkageUnionFind::with_len(0)
35    }
36
37    /// Create a new set that can merge clusters for exactly `len`
38    /// observations.
39    pub fn with_len(len: usize) -> LinkageUnionFind {
40        let size = if len == 0 { 0 } else { 2 * len - 1 };
41        LinkageUnionFind { parents: (0..size).collect(), next_parent: len }
42    }
43
44    /// Clear this allocation and resize it as appropriate to support `len`
45    /// observations.
46    pub fn reset(&mut self, len: usize) {
47        let size = if len == 0 { 0 } else { 2 * len - 1 };
48        self.next_parent = len;
49        self.parents.resize(size, 0);
50        for (i, parent) in self.parents.iter_mut().enumerate() {
51            *parent = i;
52        }
53    }
54
55    /// Union the two clusters represented by the given labels.
56    ///
57    /// If the two clusters have already been merged, then this is a no-op.
58    pub fn union(&mut self, cluster1: usize, cluster2: usize) {
59        // If the clusters are already in the same set, then
60        // this is a no-op.
61        if self.find(cluster1) == self.find(cluster2) {
62            return;
63        }
64
65        assert!(self.next_parent < self.parents.len());
66        self.parents[cluster1] = self.next_parent;
67        self.parents[cluster2] = self.next_parent;
68        self.next_parent = self.next_parent + 1;
69    }
70
71    /// Return the root cluster label containing the cluster given.
72    pub fn find(&mut self, mut cluster: usize) -> usize {
73        // Find the parent of this cluster. The parent
74        // is the "label" of the cluster and is a root
75        // element.
76        let mut parent = cluster;
77        while let Some(p) = self.parent(parent) {
78            parent = p;
79        }
80        // To speed up subsequent calls to `find`, we
81        // set the parent of this cluster and all of its
82        // ancestors up to `parent`.
83        while let Some(p) = self.parent(cluster) {
84            self.parents[cluster] = parent;
85            cluster = p;
86        }
87        parent
88    }
89
90    /// Return the parent of the given cluster, if one exists. If the given
91    /// cluster is a root, then `None` is returned.
92    fn parent(&self, cluster: usize) -> Option<usize> {
93        let p = self.parents[cluster];
94        if p == cluster {
95            None
96        } else {
97            Some(p)
98        }
99    }
100
101    /// Relabel the cluster labels in each step of a complete dendrogram.
102    ///
103    /// If the given method requires the dendrogram to be sorted, then the
104    /// steps of the dendrogram are sorted by their dissimilarities.
105    pub fn relabel<T: PartialOrd>(
106        &mut self,
107        dendrogram: &mut Dendrogram<T>,
108        method: Method,
109    ) {
110        self.reset(dendrogram.observations());
111        if method.requires_sorting() {
112            dendrogram.steps_mut().sort_by(|step1, step2| {
113                // Floats have a partial ordering because of NaN. There's
114                // basically two reasonable things we could do here:
115                //
116                //   1. Panic if we find a NaN. A NaN dissimilarity between two
117                //      clusters probably indicates a bug somewhere, and it's
118                //      not clear when this could happen.
119                //   2. If we have a NaN. then cast to bit representation and
120                //      derive an ordering from that (or follow whatever IEEE
121                //      says).
122                //
123                // We choose door #1 because it likely indicates a bug, and
124                // we'd rather the bug fail loudly until we understand it
125                // enough that we can fix it some other way.
126                step1
127                    .dissimilarity
128                    .partial_cmp(&step2.dissimilarity)
129                    .expect("NaNs not allowed in dendrogram")
130            });
131        }
132        for i in 0..dendrogram.len() {
133            let new_cluster1 = self.find(dendrogram[i].cluster1);
134            let new_cluster2 = self.find(dendrogram[i].cluster2);
135            self.union(new_cluster1, new_cluster2);
136
137            let size1 = dendrogram.cluster_size(new_cluster1);
138            let size2 = dendrogram.cluster_size(new_cluster2);
139            dendrogram[i].set_clusters(new_cluster1, new_cluster2);
140            dendrogram[i].size = size1 + size2;
141        }
142    }
143}
144
145#[cfg(test)]
146mod tests {
147    use super::LinkageUnionFind;
148    use crate::dendrogram::{Dendrogram, Step};
149    use crate::Method;
150
151    #[test]
152    fn trivial_find() {
153        let mut set = LinkageUnionFind::with_len(5);
154        // In the trivial set, each member is its own cluster.
155        for i in 0..5 {
156            assert_eq!(i, set.find(i));
157        }
158    }
159
160    #[test]
161    fn find_with_unions() {
162        let mut set = LinkageUnionFind::with_len(5);
163
164        set.union(1, 3);
165        assert_eq!(0, set.find(0));
166        assert_eq!(5, set.find(1));
167        assert_eq!(2, set.find(2));
168        assert_eq!(5, set.find(3));
169        assert_eq!(4, set.find(4));
170        assert_eq!(5, set.find(5));
171
172        set.union(5, 2);
173        assert_eq!(0, set.find(0));
174        assert_eq!(6, set.find(1));
175        assert_eq!(6, set.find(2));
176        assert_eq!(6, set.find(3));
177        assert_eq!(4, set.find(4));
178        assert_eq!(6, set.find(5));
179        assert_eq!(6, set.find(6));
180
181        set.union(0, 4);
182        assert_eq!(7, set.find(0));
183        assert_eq!(6, set.find(1));
184        assert_eq!(6, set.find(2));
185        assert_eq!(6, set.find(3));
186        assert_eq!(7, set.find(4));
187        assert_eq!(6, set.find(5));
188        assert_eq!(6, set.find(6));
189        assert_eq!(7, set.find(7));
190
191        set.union(6, 7);
192        assert_eq!(8, set.find(0));
193        assert_eq!(8, set.find(1));
194        assert_eq!(8, set.find(2));
195        assert_eq!(8, set.find(3));
196        assert_eq!(8, set.find(4));
197        assert_eq!(8, set.find(5));
198        assert_eq!(8, set.find(6));
199        assert_eq!(8, set.find(7));
200    }
201
202    #[test]
203    fn find_with_unions_all_at_once() {
204        let mut set = LinkageUnionFind::with_len(5);
205
206        set.union(1, 3);
207        set.union(5, 2);
208        set.union(0, 4);
209        set.union(6, 7);
210
211        // The set is now full, so everything should be in the same cluster.
212        assert_eq!(8, set.find(0));
213        assert_eq!(8, set.find(1));
214        assert_eq!(8, set.find(2));
215        assert_eq!(8, set.find(3));
216        assert_eq!(8, set.find(4));
217        assert_eq!(8, set.find(5));
218        assert_eq!(8, set.find(6));
219        assert_eq!(8, set.find(7));
220    }
221
222    #[test]
223    fn union_is_idempotent() {
224        let mut set = LinkageUnionFind::with_len(5);
225
226        set.union(1, 3);
227        set.union(5, 2);
228        // `1` is already in the cluster `5`, so do a no-op union.
229        set.union(5, 1);
230        set.union(0, 4);
231        set.union(6, 7);
232
233        // The set is now full, so everything should be in the same cluster.
234        assert_eq!(8, set.find(0));
235        assert_eq!(8, set.find(1));
236        assert_eq!(8, set.find(2));
237        assert_eq!(8, set.find(3));
238        assert_eq!(8, set.find(4));
239        assert_eq!(8, set.find(5));
240        assert_eq!(8, set.find(6));
241        assert_eq!(8, set.find(7));
242
243        // Union two clusters already in the same cluster when the set is full.
244        set.union(1, 4);
245        assert_eq!(8, set.find(0));
246        assert_eq!(8, set.find(1));
247        assert_eq!(8, set.find(2));
248        assert_eq!(8, set.find(3));
249        assert_eq!(8, set.find(4));
250        assert_eq!(8, set.find(5));
251        assert_eq!(8, set.find(6));
252        assert_eq!(8, set.find(7));
253    }
254
255    #[test]
256    fn relabel() {
257        let mut den = Dendrogram::new(5);
258        den.push(Step::new(1, 3, 0.01, 0));
259        den.push(Step::new(1, 2, 0.02, 0));
260        den.push(Step::new(0, 4, 0.015, 0));
261        den.push(Step::new(1, 4, 0.03, 0));
262
263        let mut set = LinkageUnionFind::new();
264        set.relabel(&mut den, Method::Single);
265
266        assert_eq!(
267            den.steps(),
268            &[
269                Step::new(1, 3, 0.01, 2),
270                Step::new(0, 4, 0.015, 2),
271                Step::new(2, 5, 0.02, 3),
272                Step::new(6, 7, 0.03, 5),
273            ]
274        );
275    }
276}