1crate::ix!();
2
3use std::collections::HashMap;
4
5#[derive(Debug)]
7pub enum Dendrogram {
8 Leaf {
10 crate_name: String,
12 },
13 Internal {
15 left: Box<Dendrogram>,
17 right: Box<Dendrogram>,
19 distance: f64,
21 },
22}
23
24#[derive(Debug)]
26pub enum HierarchicalClusteringError {
27 NoCrates,
29 DataShapeError,
31 IncompleteCorrelationData,
33 IoError(std::io::Error),
35}
36
37pub fn perform_hierarchical_clustering(
47 correlations: &[(String, String, f64)]
48) -> Result<Dendrogram, HierarchicalClusteringError> {
49
50 let mut crate_set = HashMap::new();
52 for (a, b, _) in correlations {
53 crate_set.entry(a.clone()).or_insert(true);
54 crate_set.entry(b.clone()).or_insert(true);
55 }
56
57 let mut crate_names: Vec<String> = crate_set.keys().cloned().collect();
58 if crate_names.is_empty() {
59 return Err(HierarchicalClusteringError::NoCrates);
61 }
62 crate_names.sort(); let index_map: HashMap<String, usize> = crate_names
66 .iter()
67 .enumerate()
68 .map(|(i, name)| (name.clone(), i))
69 .collect();
70
71 let n = crate_names.len();
72
73 if n == 1 {
76 return Ok(Dendrogram::Leaf {
78 crate_name: crate_names[0].clone(),
79 });
80 }
81
82 let mut distance_matrix = vec![1.0; n * n];
86
87 for i in 0..n {
89 distance_matrix[i * n + i] = 0.0;
90 }
91
92 for (a, b, corr) in correlations {
95 if let (Some(&i), Some(&j)) = (index_map.get(a), index_map.get(b)) {
96 let dist = 1.0 - corr;
97 let idx1 = i * n + j;
98 let idx2 = j * n + i;
99 if idx1 < distance_matrix.len() && idx2 < distance_matrix.len() {
100 distance_matrix[idx1] = dist;
101 distance_matrix[idx2] = dist;
102 } else {
103 return Err(HierarchicalClusteringError::DataShapeError);
104 }
105 }
106 }
107
108 #[derive(Clone)]
109 struct Cluster {
110 indices: Vec<usize>,
111 }
112
113 let mut clusters: Vec<Cluster> = (0..n).map(|i| Cluster { indices: vec![i] }).collect();
115 let mut active = vec![true; n]; let mut dendrogram_nodes: Vec<Option<Dendrogram>> = crate_names
119 .iter()
120 .map(|name| Some(Dendrogram::Leaf { crate_name: name.clone() }))
121 .collect();
122
123 for _step in 0..(n-1) {
125 let mut min_dist = f64::MAX;
127 let mut closest_pair = (0, 0);
128
129 for i in 0..n {
130 if !active[i] { continue; }
131 for j in (i+1)..n {
132 if !active[j] { continue; }
133
134 let dist = cluster_distance(&clusters[i].indices, &clusters[j].indices, &distance_matrix, n)?;
135 if dist < min_dist {
136 min_dist = dist;
137 closest_pair = (i, j);
138 }
139 }
140 }
141
142 let (c1, c2) = closest_pair;
143 let mut new_indices = Vec::new();
144 new_indices.extend_from_slice(&clusters[c1].indices);
145 new_indices.extend_from_slice(&clusters[c2].indices);
146
147 let left_node = dendrogram_nodes[c1].take().ok_or(HierarchicalClusteringError::DataShapeError)?;
149 let right_node = dendrogram_nodes[c2].take().ok_or(HierarchicalClusteringError::DataShapeError)?;
150
151 let new_node = Dendrogram::Internal {
152 left: Box::new(left_node),
153 right: Box::new(right_node),
154 distance: min_dist,
155 };
156
157 clusters[c1] = Cluster { indices: new_indices };
159 dendrogram_nodes[c1] = Some(new_node);
160 active[c2] = false;
161 }
162
163 let final_node = dendrogram_nodes
165 .into_iter()
166 .enumerate()
167 .filter(|(i, _)| active[*i])
168 .map(|(_, node)| node)
169 .find(|n| n.is_some())
170 .ok_or(HierarchicalClusteringError::DataShapeError)?;
171
172 final_node.ok_or(HierarchicalClusteringError::DataShapeError)
173}
174
175fn cluster_distance(
177 c1: &impl AsRef<[usize]>,
178 c2: &impl AsRef<[usize]>,
179 distance_matrix: &[f64],
180 n: usize,
181) -> Result<f64, HierarchicalClusteringError> {
182 let mut min_dist = f64::MAX;
183 for &i in c1.as_ref() {
184 for &j in c2.as_ref() {
185 let idx = i*n + j;
186 if idx >= distance_matrix.len() {
187 return Err(HierarchicalClusteringError::DataShapeError);
188 }
189 let d = distance_matrix[idx];
190 if d < min_dist {
191 min_dist = d;
192 }
193 }
194 }
195 Ok(min_dist)
196}
197
198#[cfg(test)]
199mod hierarchical_clustering_tests {
200 use super::*;
201
202 fn correlation_tuple(a: &str, b: &str, corr: f64) -> (String, String, f64) {
203 (a.to_string(), b.to_string(), corr)
204 }
205
206 #[test]
207 fn test_no_crates() {
208 let correlations: Vec<(String, String, f64)> = Vec::new();
209 let result = perform_hierarchical_clustering(&correlations);
210 match result {
211 Err(HierarchicalClusteringError::NoCrates) => (),
212 _ => panic!("Expected NoCrates error for empty input."),
213 }
214 }
215
216 #[test]
217 fn test_single_crate() {
218 let correlations: Vec<(String, String, f64)> = Vec::new();
220 let single_crate_corr = vec![
232 correlation_tuple("only_crate", "only_crate", 1.0) ];
234
235 let result = perform_hierarchical_clustering(&single_crate_corr);
236 if let Ok(dendrogram) = result {
237 match dendrogram {
238 Dendrogram::Leaf { crate_name } => {
239 assert_eq!(crate_name, "only_crate");
240 },
241 _ => panic!("Expected a leaf for a single crate."),
242 }
243 } else {
244 panic!("Expected success for single crate scenario.");
245 }
246 }
247
248 #[test]
249 fn test_two_crates_no_correlation() {
250 let correlations = vec![correlation_tuple("crateA", "crateB", 0.0)];
252 let result = perform_hierarchical_clustering(&correlations);
253 if let Ok(dendrogram) = result {
254 match dendrogram {
256 Dendrogram::Internal { left, right, distance } => {
257 assert!((distance - 1.0).abs() < 1e-9);
259 match (*left, *right) {
260 (Dendrogram::Leaf { crate_name: ref c1 }, Dendrogram::Leaf { crate_name: ref c2 }) => {
261 let mut crates = vec![c1.as_str(), c2.as_str()];
262 crates.sort();
263 assert_eq!(crates, vec!["crateA", "crateB"]);
264 },
265 _ => panic!("Expected two leaf nodes."),
266 }
267 },
268 _ => panic!("Expected an Internal node for two crates."),
269 }
270 } else {
271 panic!("Expected success for two crates no correlation.");
272 }
273 }
274
275 #[test]
276 fn test_perfect_correlation() {
277 let correlations = vec![correlation_tuple("crateA", "crateB", 1.0)];
279 let result = perform_hierarchical_clustering(&correlations);
280 if let Ok(dendrogram) = result {
281 match dendrogram {
282 Dendrogram::Internal { distance, .. } => {
283 assert!((distance - 0.0).abs() < 1e-9);
285 },
286 _ => panic!("Expected Internal node for two crates."),
287 }
288 } else {
289 panic!("Expected success for perfect correlation.");
290 }
291 }
292
293 #[test]
294 fn test_three_crates_mixed_correlations() {
295 let correlations = vec![
299 correlation_tuple("crateA", "crateB", 0.8),
300 correlation_tuple("crateB", "crateC", 0.3),
301 ];
302 let result = perform_hierarchical_clustering(&correlations);
303 if let Ok(dendrogram) = result {
304 match dendrogram {
309 Dendrogram::Internal { left, right, distance: top_dist } => {
310 assert!((top_dist - 0.7).abs() < 1e-9);
312
313 let mut leaves = Vec::new();
315 fn collect_leaves(d: &Dendrogram, leaves: &mut Vec<String>) {
316 match d {
317 Dendrogram::Leaf { crate_name } => leaves.push(crate_name.clone()),
318 Dendrogram::Internal { left, right, .. } => {
319 collect_leaves(left, leaves);
320 collect_leaves(right, leaves);
321 }
322 }
323 }
324
325 collect_leaves(&*left, &mut leaves);
326 collect_leaves(&*right, &mut leaves);
327
328 leaves.sort();
329 assert_eq!(leaves, vec!["crateA", "crateB", "crateC"]);
330 },
331 _ => panic!("Expected internal node at top."),
332 }
333 } else {
334 panic!("Expected success for three crates mixed correlations.");
335 }
336 }
337
338 #[test]
339 fn test_incomplete_correlation_data() {
340 let correlations = vec![
344 correlation_tuple("crateX", "crateY", 0.5),
345 ];
346 let correlations = vec![
353 correlation_tuple("crateX", "crateY", 0.5),
354 ];
355 let correlations = vec![
360 correlation_tuple("crateX", "crateY", 0.5),
361 correlation_tuple("crateX", "crateZ", 0.0), ];
363
364 let result = perform_hierarchical_clustering(&correlations);
366 if let Ok(dendrogram) = result {
367 let mut leaves = Vec::new();
369 fn collect_leaves(d: &Dendrogram, leaves: &mut Vec<String>) {
370 match d {
371 Dendrogram::Leaf { crate_name } => leaves.push(crate_name.clone()),
372 Dendrogram::Internal { left, right, .. } => {
373 collect_leaves(left, leaves);
374 collect_leaves(right, leaves);
375 }
376 }
377 }
378
379 collect_leaves(&dendrogram, &mut leaves);
380 leaves.sort();
381 assert_eq!(leaves, vec!["crateX", "crateY", "crateZ"]);
382 } else {
383 panic!("Expected success even with incomplete data (missing pairs).");
384 }
385 }
386
387 #[test]
388 fn test_many_crates_low_correlation() {
389 let crates = &["a", "b", "c", "d", "e"];
392 let mut correlations = Vec::new();
393 correlations.push(correlation_tuple("a", "b", 0.0));
395 correlations.push(correlation_tuple("b", "c", 0.0));
396 correlations.push(correlation_tuple("c", "d", 0.0));
397 correlations.push(correlation_tuple("d", "e", 0.0));
398 let result = perform_hierarchical_clustering(&correlations);
401 if let Ok(dendrogram) = result {
402 let mut leaves = Vec::new();
404 fn collect_leaves(d: &Dendrogram, leaves: &mut Vec<String>) {
405 match d {
406 Dendrogram::Leaf { crate_name } => leaves.push(crate_name.clone()),
407 Dendrogram::Internal { left, right, .. } => {
408 collect_leaves(left, leaves);
409 collect_leaves(right, leaves);
410 }
411 }
412 }
413
414 collect_leaves(&dendrogram, &mut leaves);
415 leaves.sort();
416 assert_eq!(leaves, vec!["a", "b", "c", "d", "e"]);
417 } else {
418 panic!("Expected success with many crates low correlation.");
419 }
420 }
421
422 }
427