Skip to main content

aprender/cluster/
agglomerative.rs

1//! Agglomerative (bottom-up) hierarchical clustering.
2//!
3//! Builds a tree of clusters (dendrogram) by iteratively merging
4//! closest clusters using various linkage methods.
5
6use crate::error::Result;
7use crate::primitives::Matrix;
8use crate::traits::UnsupervisedEstimator;
9use serde::{Deserialize, Serialize};
10
11/// Linkage methods for hierarchical clustering.
12#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
13pub enum Linkage {
14    /// Minimum distance between clusters (single linkage).
15    Single,
16    /// Maximum distance between clusters (complete linkage).
17    Complete,
18    /// Average distance between all pairs (average linkage).
19    Average,
20    /// Minimize within-cluster variance (Ward's method).
21    Ward,
22}
23
24/// Dendrogram merge record.
25#[derive(Debug, Clone, Serialize, Deserialize)]
26pub struct Merge {
27    /// Clusters being merged.
28    pub clusters: (usize, usize),
29    /// Distance at which merge occurs.
30    pub distance: f32,
31    /// New cluster size after merge.
32    pub size: usize,
33}
34
35/// Agglomerative (bottom-up) hierarchical clustering.
36///
37/// Builds a tree of clusters (dendrogram) by iteratively merging
38/// closest clusters using various linkage methods.
39///
40/// # Algorithm
41///
42/// 1. Start with each point as its own cluster
43/// 2. Repeat until reaching `n_clusters`:
44///    - Find two closest clusters using linkage method
45///    - Merge them
46///    - Update distance matrix
47/// 3. Return cluster labels
48///
49/// # Examples
50///
51/// ```
52/// use aprender::prelude::*;
53///
54/// let data = Matrix::from_vec(6, 2, vec![
55///     1.0, 1.0, 1.1, 1.0, 1.0, 1.1,
56///     5.0, 5.0, 5.1, 5.0, 5.0, 5.1,
57/// ]).expect("Valid matrix dimensions and data length");
58///
59/// let mut hc = AgglomerativeClustering::new(2, Linkage::Average);
60/// hc.fit(&data).expect("Fit succeeds with valid data");
61///
62/// let labels = hc.predict(&data);
63/// assert_eq!(labels.len(), 6);
64/// ```
65///
66/// # Performance
67///
68/// - Time complexity: O(n³) for naive implementation
69/// - Space complexity: O(n²) for distance matrix
70#[derive(Debug, Clone, Serialize, Deserialize)]
71pub struct AgglomerativeClustering {
72    /// Target number of clusters.
73    n_clusters: usize,
74    /// Linkage method for distance calculation.
75    linkage: Linkage,
76    /// Cluster labels after fitting.
77    labels: Option<Vec<usize>>,
78    /// Dendrogram merge history.
79    dendrogram: Option<Vec<Merge>>,
80}
81
82impl AgglomerativeClustering {
83    /// Create new `AgglomerativeClustering` with target number of clusters and linkage method.
84    #[must_use]
85    pub fn new(n_clusters: usize, linkage: Linkage) -> Self {
86        Self {
87            n_clusters,
88            linkage,
89            labels: None,
90            dendrogram: None,
91        }
92    }
93
94    /// Get target number of clusters.
95    #[must_use]
96    pub fn n_clusters(&self) -> usize {
97        self.n_clusters
98    }
99
100    /// Get linkage method.
101    #[must_use]
102    pub fn linkage(&self) -> Linkage {
103        self.linkage
104    }
105
106    /// Check if model has been fitted.
107    #[must_use]
108    pub fn is_fitted(&self) -> bool {
109        self.labels.is_some()
110    }
111
112    /// Get cluster labels (panic if not fitted).
113    #[must_use]
114    pub fn labels(&self) -> &Vec<usize> {
115        self.labels
116            .as_ref()
117            .expect("Model not fitted. Call fit() first.")
118    }
119
120    /// Get dendrogram merge history (panic if not fitted).
121    #[must_use]
122    pub fn dendrogram(&self) -> &Vec<Merge> {
123        self.dendrogram
124            .as_ref()
125            .expect("Model not fitted. Call fit() first.")
126    }
127
128    /// ONE PATH: Core computation delegates to `nn::functional::euclidean_distance` (UCBD §4).
129    #[allow(clippy::unused_self)]
130    fn euclidean_distance(&self, x: &Matrix<f32>, i: usize, j: usize) -> f32 {
131        let n_features = x.shape().1;
132        let row_i: Vec<f32> = (0..n_features).map(|k| x.get(i, k)).collect();
133        let row_j: Vec<f32> = (0..n_features).map(|k| x.get(j, k)).collect();
134        crate::nn::functional::euclidean_distance(&row_i, &row_j)
135    }
136
137    /// Calculate pairwise distance matrix.
138    #[allow(clippy::needless_range_loop)]
139    fn pairwise_distances(&self, x: &Matrix<f32>) -> Vec<Vec<f32>> {
140        let n_samples = x.shape().0;
141        let mut distances = vec![vec![0.0; n_samples]; n_samples];
142        for i in 0..n_samples {
143            for j in (i + 1)..n_samples {
144                let dist = self.euclidean_distance(x, i, j);
145                distances[i][j] = dist;
146                distances[j][i] = dist;
147            }
148        }
149        distances
150    }
151
152    /// Find minimum distance between two active clusters.
153    #[allow(clippy::unused_self)]
154    fn find_closest_clusters(
155        &self,
156        distances: &[Vec<f32>],
157        active: &[bool],
158    ) -> (usize, usize, f32) {
159        let n = distances.len();
160        let mut min_dist = f32::INFINITY;
161        let mut min_i = 0;
162        let mut min_j = 1;
163
164        for i in 0..n {
165            if !active[i] {
166                continue;
167            }
168            for j in (i + 1)..n {
169                if !active[j] {
170                    continue;
171                }
172                if distances[i][j] < min_dist {
173                    min_dist = distances[i][j];
174                    min_i = i;
175                    min_j = j;
176                }
177            }
178        }
179
180        (min_i, min_j, min_dist)
181    }
182
183    /// Collect pairwise distances between two clusters.
184    fn pairwise_cluster_distances(
185        &self,
186        x: &Matrix<f32>,
187        cluster_a: &[usize],
188        cluster_b: &[usize],
189    ) -> Vec<f32> {
190        let mut dists = Vec::with_capacity(cluster_a.len() * cluster_b.len());
191        for &i in cluster_a {
192            for &j in cluster_b {
193                dists.push(self.euclidean_distance(x, i, j));
194            }
195        }
196        dists
197    }
198
199    /// Update distances for a newly merged cluster using specified linkage.
200    fn update_distances(
201        &self,
202        x: &Matrix<f32>,
203        distances: &mut [Vec<f32>],
204        clusters: &[Vec<usize>],
205        merged_idx: usize,
206        other_idx: usize,
207    ) {
208        let merged_cluster = &clusters[merged_idx];
209        let other_cluster = &clusters[other_idx];
210
211        let dist = match self.linkage {
212            Linkage::Single => {
213                let dists = self.pairwise_cluster_distances(x, merged_cluster, other_cluster);
214                dists.into_iter().fold(f32::INFINITY, f32::min)
215            }
216            Linkage::Complete => {
217                let dists = self.pairwise_cluster_distances(x, merged_cluster, other_cluster);
218                dists.into_iter().fold(0.0_f32, f32::max)
219            }
220            Linkage::Average => {
221                let dists = self.pairwise_cluster_distances(x, merged_cluster, other_cluster);
222                if dists.is_empty() {
223                    0.0
224                } else {
225                    dists.iter().sum::<f32>() / dists.len() as f32
226                }
227            }
228            Linkage::Ward => {
229                let merged_centroid = self.compute_centroid(x, merged_cluster);
230                let other_centroid = self.compute_centroid(x, other_cluster);
231                let centroid_dist =
232                    crate::nn::functional::euclidean_distance(&merged_centroid, &other_centroid);
233                let n1 = merged_cluster.len() as f32;
234                let n2 = other_cluster.len() as f32;
235                ((n1 * n2) / (n1 + n2)) * centroid_dist
236            }
237        };
238
239        distances[merged_idx][other_idx] = dist;
240        distances[other_idx][merged_idx] = dist;
241    }
242
243    /// Compute centroid of a cluster.
244    #[allow(clippy::needless_range_loop)]
245    #[allow(clippy::unused_self)]
246    fn compute_centroid(&self, x: &Matrix<f32>, cluster: &[usize]) -> Vec<f32> {
247        let n_features = x.shape().1;
248        let mut centroid = vec![0.0; n_features];
249
250        for &idx in cluster {
251            for k in 0..n_features {
252                centroid[k] += x.get(idx, k);
253            }
254        }
255
256        let size = cluster.len() as f32;
257        for val in &mut centroid {
258            *val /= size;
259        }
260
261        centroid
262    }
263}
264
265impl UnsupervisedEstimator for AgglomerativeClustering {
266    type Labels = Vec<usize>;
267
268    fn fit(&mut self, x: &Matrix<f32>) -> Result<()> {
269        let n_samples = x.shape().0;
270
271        // Initialize: each point is its own cluster
272        let mut clusters: Vec<Vec<usize>> = (0..n_samples).map(|i| vec![i]).collect();
273        let mut active = vec![true; n_samples];
274        let mut cluster_labels = vec![0; n_samples];
275        let mut dendrogram = Vec::new();
276
277        // Calculate initial pairwise distances
278        let mut distances = self.pairwise_distances(x);
279
280        // Merge until reaching target number of clusters
281        while clusters.iter().filter(|c| !c.is_empty()).count() > self.n_clusters {
282            // Find closest pair
283            let (i, j, dist) = self.find_closest_clusters(&distances, &active);
284
285            // Merge cluster j into cluster i
286            let merged_cluster = clusters[j].clone();
287            clusters[i].extend(&merged_cluster);
288            clusters[j].clear();
289            active[j] = false;
290
291            // Record merge in dendrogram
292            dendrogram.push(Merge {
293                clusters: (i, j),
294                distance: dist,
295                size: clusters[i].len(),
296            });
297
298            // Update distances for merged cluster
299            #[allow(clippy::needless_range_loop)]
300            for k in 0..n_samples {
301                if k == i || !active[k] {
302                    continue;
303                }
304                self.update_distances(x, &mut distances, &clusters, i, k);
305            }
306        }
307
308        // Assign labels
309        let mut cluster_id = 0;
310        for cluster in &clusters {
311            if !cluster.is_empty() {
312                for &point_idx in cluster {
313                    cluster_labels[point_idx] = cluster_id;
314                }
315                cluster_id += 1;
316            }
317        }
318
319        self.labels = Some(cluster_labels);
320        self.dendrogram = Some(dendrogram);
321        Ok(())
322    }
323
324    fn predict(&self, _x: &Matrix<f32>) -> Self::Labels {
325        // For hierarchical clustering, predict returns fitted labels
326        // (new points would require a different strategy)
327        self.labels().clone()
328    }
329}