// crate_activity/hierarchical_clustering.rs

1crate::ix!();
2
3use std::collections::HashMap;
4
/// A node in the hierarchical-clustering dendrogram (binary merge tree)
/// produced by `perform_hierarchical_clustering`.
#[derive(Debug)]
pub enum Dendrogram {
    /// A leaf node representing a single crate.
    Leaf {
        /// Name of the crate represented by this leaf.
        crate_name: String,
    },
    /// An internal node representing a merge of two clusters.
    Internal {
        /// Left child cluster.
        left: Box<Dendrogram>,
        /// Right child cluster.
        right: Box<Dendrogram>,
        /// Single-linkage distance (`1.0 - correlation`) at which the two
        /// child clusters were merged; larger means less correlated.
        distance: f64,
    },
}
23
/// Errors that can occur during hierarchical clustering.
#[derive(Debug)]
pub enum HierarchicalClusteringError {
    /// No crates provided for clustering.
    NoCrates,
    /// Inconsistent or incomplete data caused shape issues.
    DataShapeError,
    /// Insufficient correlation data.
    IncompleteCorrelationData,
    /// Other I/O or data-related issues.
    IoError(std::io::Error),
}

impl std::fmt::Display for HierarchicalClusteringError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NoCrates => write!(f, "no crates provided for clustering"),
            Self::DataShapeError => {
                write!(f, "inconsistent or incomplete data caused shape issues")
            }
            Self::IncompleteCorrelationData => write!(f, "insufficient correlation data"),
            Self::IoError(e) => write!(f, "I/O error during clustering: {}", e),
        }
    }
}

impl std::error::Error for HierarchicalClusteringError {
    /// Expose the underlying I/O error (if any) for error-chain reporting.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::IoError(e) => Some(e),
            Self::NoCrates | Self::DataShapeError | Self::IncompleteCorrelationData => None,
        }
    }
}

impl From<std::io::Error> for HierarchicalClusteringError {
    /// Allow `?` to convert I/O failures into clustering errors.
    fn from(e: std::io::Error) -> Self {
        Self::IoError(e)
    }
}
36
37/// Perform hierarchical clustering using single-linkage based on crate correlations.
38///
39/// # Arguments
40///
41/// * `correlations` - A vector of (crate_a, crate_b, correlation) tuples from `compute_pairwise_correlations`.
42///
43/// # Returns
44///
45/// A `Dendrogram` representing the hierarchical clustering structure.
46pub fn perform_hierarchical_clustering(
47    correlations: &[(String, String, f64)]
48) -> Result<Dendrogram, HierarchicalClusteringError> {
49
50    // Extract unique crate names from the correlation tuples.
51    let mut crate_set = HashMap::new();
52    for (a, b, _) in correlations {
53        crate_set.entry(a.clone()).or_insert(true);
54        crate_set.entry(b.clone()).or_insert(true);
55    }
56
57    let mut crate_names: Vec<String> = crate_set.keys().cloned().collect();
58    if crate_names.is_empty() {
59        // No crates at all means we cannot cluster.
60        return Err(HierarchicalClusteringError::NoCrates);
61    }
62    crate_names.sort(); // Ensure stable ordering of crates.
63
64    // Map crate names to indices
65    let index_map: HashMap<String, usize> = crate_names
66        .iter()
67        .enumerate()
68        .map(|(i, name)| (name.clone(), i))
69        .collect();
70
71    let n = crate_names.len();
72
73    // If there's only one crate, hierarchical clustering is trivial.
74    // Just return a single leaf node.
75    if n == 1 {
76        // Single crate scenario: just return a leaf, ignoring correlations.
77        return Ok(Dendrogram::Leaf {
78            crate_name: crate_names[0].clone(),
79        });
80    }
81
82    // Initialize a distance matrix.
83    // Default distance = 1.0 for missing pairs.
84    // Distance = 1 - correlation.
85    let mut distance_matrix = vec![1.0; n * n];
86
87    // Distance to self is zero.
88    for i in 0..n {
89        distance_matrix[i * n + i] = 0.0;
90    }
91
92    // Fill in distance matrix from correlations
93    // If not present, distance remains 1.0 (implying no correlation).
94    for (a, b, corr) in correlations {
95        if let (Some(&i), Some(&j)) = (index_map.get(a), index_map.get(b)) {
96            let dist = 1.0 - corr;
97            let idx1 = i * n + j;
98            let idx2 = j * n + i;
99            if idx1 < distance_matrix.len() && idx2 < distance_matrix.len() {
100                distance_matrix[idx1] = dist;
101                distance_matrix[idx2] = dist;
102            } else {
103                return Err(HierarchicalClusteringError::DataShapeError);
104            }
105        }
106    }
107
108    #[derive(Clone)]
109    struct Cluster {
110        indices: Vec<usize>,
111    }
112
113    // Each crate starts as its own cluster
114    let mut clusters: Vec<Cluster> = (0..n).map(|i| Cluster { indices: vec![i] }).collect();
115    let mut active = vec![true; n]; // which cluster IDs are active
116
117    // Each leaf node initially points to a leaf dendrogram
118    let mut dendrogram_nodes: Vec<Option<Dendrogram>> = crate_names
119        .iter()
120        .map(|name| Some(Dendrogram::Leaf { crate_name: name.clone() }))
121        .collect();
122
123    // Perform (n-1) merges
124    for _step in 0..(n-1) {
125        // Find the two closest distinct active clusters
126        let mut min_dist = f64::MAX;
127        let mut closest_pair = (0, 0);
128
129        for i in 0..n {
130            if !active[i] { continue; }
131            for j in (i+1)..n {
132                if !active[j] { continue; }
133
134                let dist = cluster_distance(&clusters[i].indices, &clusters[j].indices, &distance_matrix, n)?;
135                if dist < min_dist {
136                    min_dist = dist;
137                    closest_pair = (i, j);
138                }
139            }
140        }
141
142        let (c1, c2) = closest_pair;
143        let mut new_indices = Vec::new();
144        new_indices.extend_from_slice(&clusters[c1].indices);
145        new_indices.extend_from_slice(&clusters[c2].indices);
146
147        // Create a new dendrogram node from merging c1 and c2
148        let left_node = dendrogram_nodes[c1].take().ok_or(HierarchicalClusteringError::DataShapeError)?;
149        let right_node = dendrogram_nodes[c2].take().ok_or(HierarchicalClusteringError::DataShapeError)?;
150
151        let new_node = Dendrogram::Internal {
152            left: Box::new(left_node),
153            right: Box::new(right_node),
154            distance: min_dist,
155        };
156
157        // Merge c2 into c1 and deactivate c2
158        clusters[c1] = Cluster { indices: new_indices };
159        dendrogram_nodes[c1] = Some(new_node);
160        active[c2] = false;
161    }
162
163    // The final active cluster is our root
164    let final_node = dendrogram_nodes
165        .into_iter()
166        .enumerate()
167        .filter(|(i, _)| active[*i])
168        .map(|(_, node)| node)
169        .find(|n| n.is_some())
170        .ok_or(HierarchicalClusteringError::DataShapeError)?;
171
172    final_node.ok_or(HierarchicalClusteringError::DataShapeError)
173}
174
175/// Compute single-linkage distance between two clusters.
176fn cluster_distance(
177    c1: &impl AsRef<[usize]>,
178    c2: &impl AsRef<[usize]>,
179    distance_matrix: &[f64],
180    n: usize,
181) -> Result<f64, HierarchicalClusteringError> {
182    let mut min_dist = f64::MAX;
183    for &i in c1.as_ref() {
184        for &j in c2.as_ref() {
185            let idx = i*n + j;
186            if idx >= distance_matrix.len() {
187                return Err(HierarchicalClusteringError::DataShapeError);
188            }
189            let d = distance_matrix[idx];
190            if d < min_dist {
191                min_dist = d;
192            }
193        }
194    }
195    Ok(min_dist)
196}
197
#[cfg(test)]
mod hierarchical_clustering_tests {
    use super::*;

    /// Build an owned correlation tuple from borrowed crate names.
    fn correlation_tuple(a: &str, b: &str, corr: f64) -> (String, String, f64) {
        (a.to_string(), b.to_string(), corr)
    }

    /// Recursively collect every leaf crate name of `d` into `leaves`.
    fn collect_leaves(d: &Dendrogram, leaves: &mut Vec<String>) {
        match d {
            Dendrogram::Leaf { crate_name } => leaves.push(crate_name.clone()),
            Dendrogram::Internal { left, right, .. } => {
                collect_leaves(left, leaves);
                collect_leaves(right, leaves);
            }
        }
    }

    /// Convenience: all leaf names of a dendrogram, sorted.
    fn sorted_leaves(d: &Dendrogram) -> Vec<String> {
        let mut leaves = Vec::new();
        collect_leaves(d, &mut leaves);
        leaves.sort();
        leaves
    }

    #[test]
    fn test_no_crates() {
        let correlations: Vec<(String, String, f64)> = Vec::new();
        match perform_hierarchical_clustering(&correlations) {
            Err(HierarchicalClusteringError::NoCrates) => (),
            _ => panic!("Expected NoCrates error for empty input."),
        }
    }

    #[test]
    fn test_single_crate() {
        // Crates are discovered only through correlation tuples, so the sole
        // way to present a single crate is an artificial self-pair.
        let single_crate_corr = vec![correlation_tuple("only_crate", "only_crate", 1.0)];
        match perform_hierarchical_clustering(&single_crate_corr) {
            Ok(Dendrogram::Leaf { crate_name }) => assert_eq!(crate_name, "only_crate"),
            other => panic!("Expected a leaf for a single crate, got {:?}", other),
        }
    }

    #[test]
    fn test_two_crates_no_correlation() {
        // Two distinct crates with zero correlation (distance = 1.0).
        let correlations = vec![correlation_tuple("crateA", "crateB", 0.0)];
        match perform_hierarchical_clustering(&correlations) {
            Ok(Dendrogram::Internal { left, right, distance }) => {
                // Distance should be 1 - 0 = 1.
                assert!((distance - 1.0).abs() < 1e-9);
                match (*left, *right) {
                    (
                        Dendrogram::Leaf { crate_name: c1 },
                        Dendrogram::Leaf { crate_name: c2 },
                    ) => {
                        let mut crates = vec![c1, c2];
                        crates.sort();
                        assert_eq!(crates, vec!["crateA", "crateB"]);
                    }
                    _ => panic!("Expected two leaf nodes."),
                }
            }
            other => panic!("Expected an Internal node for two crates, got {:?}", other),
        }
    }

    #[test]
    fn test_perfect_correlation() {
        // Perfectly correlated crates merge at distance 1 - 1.0 = 0.0.
        let correlations = vec![correlation_tuple("crateA", "crateB", 1.0)];
        match perform_hierarchical_clustering(&correlations) {
            Ok(Dendrogram::Internal { distance, .. }) => {
                assert!(distance.abs() < 1e-9);
            }
            other => panic!("Expected Internal node for two crates, got {:?}", other),
        }
    }

    #[test]
    fn test_three_crates_mixed_correlations() {
        // A-B: corr 0.8 -> dist 0.2 (first merge)
        // B-C: corr 0.3 -> dist 0.7 (second merge, via single linkage through B)
        // A-C: missing  -> dist 1.0 by default
        let correlations = vec![
            correlation_tuple("crateA", "crateB", 0.8),
            correlation_tuple("crateB", "crateC", 0.3),
        ];
        let dendrogram = perform_hierarchical_clustering(&correlations)
            .expect("Expected success for three crates mixed correlations.");
        match dendrogram {
            Dendrogram::Internal { left, right, distance: top_dist } => {
                // The top-level merge joins the (A,B) cluster with C at 0.7.
                assert!((top_dist - 0.7).abs() < 1e-9);

                let mut leaves = Vec::new();
                collect_leaves(&left, &mut leaves);
                collect_leaves(&right, &mut leaves);
                leaves.sort();
                assert_eq!(leaves, vec!["crateA", "crateB", "crateC"]);
            }
            _ => panic!("Expected internal node at top."),
        }
    }

    #[test]
    fn test_incomplete_correlation_data() {
        // Y-Z is never mentioned, so it defaults to distance 1.0. Missing
        // pairs are not an error; clustering should still cover all crates.
        let correlations = vec![
            correlation_tuple("crateX", "crateY", 0.5),
            correlation_tuple("crateX", "crateZ", 0.0), // X-Z defined, Y-Z missing
        ];
        let dendrogram = perform_hierarchical_clustering(&correlations)
            .expect("Expected success even with incomplete data (missing pairs).");
        assert_eq!(sorted_leaves(&dendrogram), vec!["crateX", "crateY", "crateZ"]);
    }

    #[test]
    fn test_many_crates_low_correlation() {
        // A chain of zero correlations; unmentioned pairs default to 1.0.
        // Ensures no panic and that every crate ends up in the dendrogram.
        let correlations = vec![
            correlation_tuple("a", "b", 0.0),
            correlation_tuple("b", "c", 0.0),
            correlation_tuple("c", "d", 0.0),
            correlation_tuple("d", "e", 0.0),
        ];
        let dendrogram = perform_hierarchical_clustering(&correlations)
            .expect("Expected success with many crates low correlation.");
        assert_eq!(sorted_leaves(&dendrogram), vec!["a", "b", "c", "d", "e"]);
    }

    // Data-shape errors would require corrupting internal state, which the
    // public API does not allow; the reachable error path (empty input) is
    // covered by `test_no_crates` above.
}
427