1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
use std::mem;
use std::ops;

use num_traits::Float;

/// A step-wise dendrogram that represents a hierarchical clustering as a
/// binary tree.
///
/// A dendrogram consists of a series of `N - 1` steps, where `N` is the number
/// of observations that were clustered. Each step corresponds to a merge
/// between two other clusters (where a cluster might consist of one or more
/// observations). Each step includes the labels for the pair of clusters that
/// were merged, the number of total observations in the new merged cluster
/// and the dissimilarity between the two merged clusters.
///
/// The labels of clusters are assigned as follows:
///
/// 1. A cluster that corresponds to a single observation is assigned a label
///    that corresponds to the given observation's index in the pairwise
///    dissimilarity matrix.
/// 2. A cluster with more than one observation has the label `N + i`, where
///    `N` is the total number of observations and `i` is the zero-based index
///    of the step in which the cluster was created. So for example, the very
///    first step in a dendrogram creates a cluster with the label `N` and the
///    last step in a dendrogram creates a cluster with the label
///    `(N + N - 1) - 1` (since there are always `N - 1` steps in a
///    dendrogram).
///
/// This labeling scheme corresponds to the same labeling scheme used by SciPy.
///
/// The type parameter `T` refers to the type of dissimilarity used in the
/// steps. In practice, `T` is a floating point type.
#[derive(Debug, Eq, Hash, PartialEq)]
pub struct Dendrogram<T> {
    /// The merge steps, stored in the order in which they were pushed. The
    /// step at index `i` is the cluster with label `observations + i`.
    steps: Vec<Step<T>>,
    /// The number of observations this dendrogram supports. It bounds the
    /// number of steps that may be pushed to `observations - 1`.
    observations: usize,
}

/// A single merge step in a dendrogram.
///
/// A step always corresponds to a merge between two clusters, where each
/// cluster has at least one observation. Each step itself corresponds to a new
/// cluster containing the observations of the merged clusters.
///
/// By convention, the smaller label is assigned to `cluster1`. The
/// constructor and the label setter on this type apply that ordering, but
/// because the fields are public, nothing enforces it when the fields are
/// written directly.
///
/// The type parameter `T` refers to the type of dissimilarity used. In
/// practice, `T` is a floating point type.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct Step<T> {
    /// The label corresponding to the first cluster.
    ///
    /// The algorithm for labeling clusters is documented on
    /// [`Dendrogram`](struct.Dendrogram.html).
    pub cluster1: usize,
    /// The label corresponding to the second cluster.
    ///
    /// The algorithm for labeling clusters is documented on
    /// [`Dendrogram`](struct.Dendrogram.html).
    pub cluster2: usize,
    /// The dissimilarity between `cluster1` and `cluster2`.
    ///
    /// If both `cluster1` and `cluster2` correspond to singleton clusters,
    /// then this dissimilarity is equivalent to the pairwise dissimilarity
    /// between the clusters' corresponding observations. Otherwise, the
    /// dissimilarity is computed according to the clustering
    /// [`Method`](enum.Method.html) used.
    pub dissimilarity: T,
    /// The total number of observations in this merged cluster. This is
    /// always equivalent to the total number of observations in `cluster1`
    /// plus the total number of observations in `cluster2`.
    pub size: usize,
}

impl<T> Dendrogram<T> {
    /// Return a new dendrogram with space for the given number of
    /// observations.
    ///
    /// The returned dendrogram contains no steps.
    pub fn new(observations: usize) -> Dendrogram<T> {
        Dendrogram {
            steps: Vec::with_capacity(observations),
            observations: observations,
        }
    }

    /// Clear this dendrogram and ensure there is space for the given number
    /// of observations.
    ///
    /// This method is useful for reusing a dendrogram's allocation. All
    /// existing steps are removed, and the storage is grown if it is too
    /// small for the new number of observations.
    ///
    /// Note that this method does not need to be called before passing it to
    /// one of the clustering functions. The clustering functions will reset
    /// the dendrogram for you.
    pub fn reset(&mut self, observations: usize) {
        self.steps.clear();
        // `clear` keeps the existing allocation, which may be smaller than
        // what the new observation count needs. Reserve explicitly so that a
        // reset dendrogram has the same capacity guarantee as one built with
        // `new`. This is a no-op when the allocation is already big enough.
        self.steps.reserve(observations);
        self.observations = observations;
    }

    /// Push a new step on to this dendrogram.
    ///
    /// # Panics
    ///
    /// This method panics if the dendrogram already has `N - 1` steps, where
    /// `N` is the number of observations supported by this dendrogram. In
    /// particular, it always panics when `N` is `0` or `1`.
    pub fn push(&mut self, step: Step<T>) {
        // Saturating so that zero observations yields a limit of zero steps
        // instead of underflowing.
        let max_steps = self.observations().saturating_sub(1);
        assert!(
            self.len() < max_steps,
            "dendrogram with {} observations cannot hold more than {} steps",
            self.observations(),
            max_steps
        );
        self.steps.push(step);
    }

    /// Returns the steps in the dendrogram.
    pub fn steps(&self) -> &[Step<T>] {
        &self.steps
    }

    /// Return a mutable slice of the steps in this dendrogram.
    pub fn steps_mut(&mut self) -> &mut [Step<T>] {
        &mut self.steps
    }

    /// Return the number of steps in this dendrogram.
    pub fn len(&self) -> usize {
        self.steps.len()
    }

    /// Return true if and only if this dendrogram has no steps.
    pub fn is_empty(&self) -> bool {
        self.steps.is_empty()
    }

    /// Return the number of observations that this dendrogram supports.
    pub fn observations(&self) -> usize {
        self.observations
    }

    /// Returns the total number of observations in the cluster identified by
    /// the following label.
    ///
    /// The label may be any value in the half-open interval
    /// `[0, N + N - 1)`, where `N` is the total number of observations.
    /// Labels below `N` identify single observations and always have size
    /// `1`. Any other label identifies the step at index `label - N`, whose
    /// recorded size is returned.
    ///
    /// # Panics
    ///
    /// This method panics if `label` is at least `N` and `label - N` is not
    /// the index of a step that has already been pushed.
    pub fn cluster_size(&self, label: usize) -> usize {
        let observations = self.observations();
        if label < observations {
            1
        } else {
            self.steps[label - observations].size
        }
    }
}

impl<T: Float> Dendrogram<T> {
    /// Compare two dendrograms for approximate equality.
    ///
    /// Two dendrograms are approximately equal when they have the same
    /// number of steps and every pair of corresponding steps is
    /// approximately equal. For a pair of steps, the dissimilarities are
    /// considered equal if and only if the absolute value of their
    /// difference is less than or equal to the given `epsilon` value.
    ///
    /// Only the steps are compared; the number of observations each
    /// dendrogram supports is not taken into account.
    pub fn eq_with_epsilon(&self, other: &Dendrogram<T>, epsilon: T) -> bool {
        // The length check must come first: `zip` stops at the shorter
        // sequence and would otherwise ignore trailing steps.
        self.len() == other.len()
            && self
                .steps()
                .iter()
                .zip(other.steps())
                .all(|(mine, theirs)| mine.eq_with_epsilon(theirs, epsilon))
    }
}

impl<T> ops::Index<usize> for Dendrogram<T> {
    type Output = Step<T>;

    /// Return a reference to the step at index `i`.
    ///
    /// # Panics
    ///
    /// Panics if `i` is not less than the number of steps.
    fn index(&self, i: usize) -> &Step<T> {
        let steps: &[Step<T>] = &self.steps;
        &steps[i]
    }
}

impl<T> ops::IndexMut<usize> for Dendrogram<T> {
    /// Return a mutable reference to the step at index `i`.
    ///
    /// # Panics
    ///
    /// Panics if `i` is not less than the number of steps.
    fn index_mut(&mut self, i: usize) -> &mut Step<T> {
        let steps: &mut [Step<T>] = &mut self.steps;
        &mut steps[i]
    }
}

impl<T> Step<T> {
    /// Create a new step that can be added to a dendrogram.
    ///
    /// The two cluster labels may be given in either order. They are
    /// normalized so that `cluster1` always holds the smaller label.
    pub fn new(
        cluster1: usize,
        cluster2: usize,
        dissimilarity: T,
        size: usize,
    ) -> Step<T> {
        let mut step = Step {
            cluster1: cluster1,
            cluster2: cluster2,
            dissimilarity: dissimilarity,
            size: size,
        };
        // Put the labels into ascending order after construction.
        if step.cluster2 < step.cluster1 {
            mem::swap(&mut step.cluster1, &mut step.cluster2);
        }
        step
    }

    /// Set the cluster labels on this step.
    ///
    /// The two cluster labels may be given in either order. They are
    /// normalized so that `cluster1` always holds the smaller label. The
    /// dissimilarity and size of this step are left unchanged.
    pub fn set_clusters(&mut self, cluster1: usize, cluster2: usize) {
        let (smaller, larger) = if cluster1 <= cluster2 {
            (cluster1, cluster2)
        } else {
            (cluster2, cluster1)
        };
        self.cluster1 = smaller;
        self.cluster2 = larger;
    }
}

impl<T: Float> Step<T> {
    /// Compare two steps for approximate equality.
    ///
    /// Two steps are approximately equal when they are exactly equal, or
    /// when their cluster labels and sizes are equal and their
    /// dissimilarities are approximately equal. Two dissimilarities are
    /// considered approximately equal if and only if the absolute value of
    /// their difference is less than or equal to the given `epsilon` value.
    ///
    /// A NaN dissimilarity is never approximately equal to anything, and a
    /// NaN `epsilon` makes every comparison of unequal steps return `false`.
    pub fn eq_with_epsilon(&self, other: &Step<T>, epsilon: T) -> bool {
        // Exact equality short-circuits. This also covers equal infinite
        // dissimilarities, whose difference is NaN and would fail the
        // epsilon comparison below.
        if self == other {
            return true;
        }
        let key1 = (self.cluster1, self.cluster2, self.size);
        let key2 = (other.cluster1, other.cluster2, other.size);
        if key1 != key2 {
            return false;
        }
        // Test `<=` directly instead of negating `>`. Every comparison with
        // NaN is false, so the negated form would treat a NaN difference or
        // a NaN epsilon as a match.
        (self.dissimilarity - other.dissimilarity).abs() <= epsilon
    }
}