scirs2_cluster/hierarchy/
dendrogram.rs

//! Dendrogram visualization and analysis tools
//!
//! This module provides functions for analyzing dendrograms produced by hierarchical
//! clustering and for preparing them for visualization.
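//!
//! # Examples
//!
//! A minimal sketch of the intended workflow. The crate-level import paths below are
//! assumed re-exports, so the example is marked `ignore` rather than compiled as a doctest.
//!
//! ```ignore
//! use scirs2_core::ndarray::{Array1, Array2};
//! // Assumed re-export paths for this crate's hierarchy API
//! use scirs2_cluster::hierarchy::{cophenet, inconsistent, linkage, LinkageMethod, Metric};
//!
//! // Four 1-D points forming two well-separated pairs
//! let data = Array2::from_shape_vec((4, 1), vec![0.0f64, 1.0, 10.0, 11.0]).unwrap();
//! let z = linkage(data.view(), LinkageMethod::Single, Metric::Euclidean).unwrap();
//!
//! // Condensed pairwise distances, in (0,1), (0,2), (0,3), (1,2), (1,3), (2,3) order
//! let d = Array1::from_vec(vec![1.0, 10.0, 11.0, 9.0, 10.0, 1.0]);
//!
//! let c = cophenet(&z, &d).unwrap();           // cophenetic correlation coefficient
//! let stats = inconsistent(&z, None).unwrap(); // per-merge [mean, std, count, inconsistency]
//! println!("cophenetic correlation: {}", c);
//! println!("inconsistency matrix: {:?}", stats);
//! ```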

use scirs2_core::ndarray::{Array1, Array2};
use scirs2_core::numeric::{Float, FromPrimitive};
use std::fmt::Debug;

use crate::error::{ClusteringError, Result};

/// Calculates the cophenetic correlation coefficient
///
/// This measures how faithfully the dendrogram preserves the original distances.
///
/// # Arguments
///
/// * `z` - The linkage matrix from the `linkage` function
/// * `d` - The original distance matrix (condensed form)
///
/// # Returns
///
/// * `Result<F>` - The cophenetic correlation coefficient
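///
/// # Examples
///
/// A minimal sketch. The `linkage` import path is an assumed re-export, so the example
/// is marked `ignore` rather than compiled as a doctest.
///
/// ```ignore
/// use scirs2_core::ndarray::{Array1, Array2};
/// // Assumed re-export paths
/// use scirs2_cluster::hierarchy::{cophenet, linkage, LinkageMethod, Metric};
///
/// let data = Array2::from_shape_vec((3, 1), vec![0.0f64, 1.0, 10.0]).unwrap();
/// let z = linkage(data.view(), LinkageMethod::Single, Metric::Euclidean).unwrap();
///
/// // Condensed distances for the pairs (0,1), (0,2), (1,2)
/// let d = Array1::from_vec(vec![1.0, 10.0, 9.0]);
///
/// // Should be close to 1.0 when the dendrogram reproduces the original distances well
/// let c = cophenet(&z, &d).unwrap();
/// assert!(c > 0.9);
/// ```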
#[allow(dead_code)]
pub fn cophenet<F: Float + FromPrimitive>(z: &Array2<F>, d: &Array1<F>) -> Result<F> {
    let n_samples = z.shape()[0] + 1;

    // Calculate the cophenetic distances
    let mut cophenetic_distances = Array1::zeros(d.len());

    // Map each cluster index (leaves and merges) to its merge height
    let mut cluster_height: Vec<F> = vec![F::zero(); 2 * n_samples - 1];

    // Record the merge height of each cluster formed in the linkage matrix
    for i in 0..z.shape()[0] {
        let height = z[[i, 2]];
        let new_cluster = n_samples + i;
        cluster_height[new_cluster] = height;
    }

    // Calculate cophenetic distances: for each pair of samples, the height of
    // their lowest common ancestor in the dendrogram
    let mut idx = 0;
    for i in 0..n_samples {
        for j in (i + 1)..n_samples {
            let lca_height = find_lca_height(i, j, z, &cluster_height);
            cophenetic_distances[idx] = lca_height;
            idx += 1;
        }
    }

    // Calculate the Pearson correlation between original and cophenetic distances
    let d_mean = d.mean().unwrap();
    let c_mean = cophenetic_distances.mean().unwrap();

    let mut numerator = F::zero();
    let mut d_variance = F::zero();
    let mut c_variance = F::zero();

    for i in 0..d.len() {
        let d_diff = d[i] - d_mean;
        let c_diff = cophenetic_distances[i] - c_mean;

        numerator = numerator + d_diff * c_diff;
        d_variance = d_variance + d_diff * d_diff;
        c_variance = c_variance + c_diff * c_diff;
    }

    let denom = (d_variance * c_variance).sqrt();

    if denom < F::from_f64(1e-10).unwrap() {
        return Err(ClusteringError::ComputationError(
            "Variance is too small to calculate cophenetic correlation".into(),
        ));
    }

    Ok(numerator / denom)
}

/// Finds the height of the lowest common ancestor of leaves `i` and `j` in the dendrogram
#[allow(dead_code)]
fn find_lca_height<F: Float>(i: usize, j: usize, z: &Array2<F>, cluster_height: &[F]) -> F {
    let n_samples = z.shape()[0] + 1;

    // Trace up the dendrogram, tracking the current cluster of each leaf
    let mut cluster_map = vec![0; 2 * n_samples - 1];
    for (idx, val) in cluster_map.iter_mut().enumerate().take(n_samples) {
        *val = idx;
    }

    for idx in 0..z.shape()[0] {
        let cluster1 = z[[idx, 0]].to_usize().unwrap();
        let cluster2 = z[[idx, 1]].to_usize().unwrap();
        let new_cluster = n_samples + idx;

        // Update cluster membership
        for (k, val) in cluster_map.iter_mut().enumerate() {
            if k < new_cluster && (*val == cluster1 || *val == cluster2) {
                *val = new_cluster;
            }
        }

        // Check if i and j are now in the same cluster
        if cluster_map[i] == cluster_map[j] {
            // Found the lowest common ancestor
            return cluster_height[cluster_map[i]];
        }
    }

    // Should not be reached for a valid linkage matrix; fall back to zero
    F::zero()
}

/// Calculates the inconsistency of each merge in the linkage matrix
///
/// The inconsistency coefficient compares the height of each merge with the mean and
/// standard deviation of the heights of the merges below it, up to a given depth.
///
/// # Arguments
///
/// * `z` - The linkage matrix from the `linkage` function
/// * `d` - Number of levels to consider in calculating inconsistency (default: 2)
///
/// # Returns
///
/// * `Result<Array2<F>>` - The inconsistency matrix, one row per merge: `[mean, std, count, inconsistency]`
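///
/// # Examples
///
/// A minimal sketch; the `linkage` import path is an assumed re-export, so the example
/// is marked `ignore` rather than compiled as a doctest.
///
/// ```ignore
/// use scirs2_core::ndarray::Array2;
/// // Assumed re-export paths
/// use scirs2_cluster::hierarchy::{inconsistent, linkage, LinkageMethod, Metric};
///
/// let data = Array2::from_shape_vec((5, 1), vec![0.0f64, 1.0, 2.0, 10.0, 11.0]).unwrap();
/// let z = linkage(data.view(), LinkageMethod::Average, Metric::Euclidean).unwrap();
///
/// // One row per merge: [mean height, std dev, number of heights, inconsistency]
/// let r = inconsistent(&z, Some(2)).unwrap();
/// assert_eq!(r.shape()[0], z.shape()[0]);
/// assert_eq!(r.shape()[1], 4);
/// ```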
#[allow(dead_code)]
pub fn inconsistent<F: Float + FromPrimitive + Debug>(
    z: &Array2<F>,
    d: Option<usize>,
) -> Result<Array2<F>> {
    let depth = d.unwrap_or(2);
    let n = z.shape()[0];

    // Output format: [mean, std, count, inconsistency]
    let mut result = Array2::zeros((n, 4));

    for i in 0..n {
        // Linkage rows (merges) within the specified depth below merge i, including i itself
        let rows = get_descendants(z, i, depth)?;

        // Extract the merge heights of those rows
        let mut heights = Vec::with_capacity(rows.len());
        for &idx in &rows {
            if idx < n {
                heights.push(z[[idx, 2]]);
            }
        }

        if heights.is_empty() {
            heights.push(z[[i, 2]]);
        }

        // Calculate statistics
        let mean = heights.iter().fold(F::zero(), |acc, &x| acc + x)
            / F::from_usize(heights.len()).unwrap();

        let mut variance = F::zero();
        for &h in &heights {
            let diff = h - mean;
            variance = variance + diff * diff;
        }
        variance = variance / F::from_usize(heights.len()).unwrap();
        let std_dev = variance.sqrt();

        // Calculate inconsistency: how far this merge's height lies above the mean,
        // in units of the standard deviation
        let inconsistency = if std_dev < F::from_f64(1e-10).unwrap() {
            F::zero()
        } else {
            (z[[i, 2]] - mean) / std_dev
        };

        // Store results
        result[[i, 0]] = mean;
        result[[i, 1]] = std_dev;
        result[[i, 2]] = F::from_usize(heights.len()).unwrap();
        result[[i, 3]] = inconsistency;
    }

    Ok(result)
}

/// Collects the linkage rows (merges) within `depth` levels below merge `row`,
/// including `row` itself
#[allow(dead_code)]
fn get_descendants<F: Float>(z: &Array2<F>, row: usize, depth: usize) -> Result<Vec<usize>> {
    let n_samples = z.shape()[0] + 1;

    // Check that the requested merge exists
    if row >= z.shape()[0] {
        return Err(ClusteringError::ComputationError(
            "Invalid node index in dendrogram".into(),
        ));
    }

    let mut result = vec![row];

    if depth == 0 {
        return Ok(result);
    }

    // Children of this merge: indices >= n_samples refer to the merge at row
    // (index - n_samples); indices < n_samples are original samples (leaves)
    // and contribute no further rows
    let left = z[[row, 0]].to_usize().unwrap();
    let right = z[[row, 1]].to_usize().unwrap();

    // Add descendants recursively
    if left >= n_samples {
        result.extend(get_descendants(z, left - n_samples, depth - 1)?);
    }
    if right >= n_samples {
        result.extend(get_descendants(z, right - n_samples, depth - 1)?);
    }

    Ok(result)
}

/// Calculates the optimal leaf ordering for a dendrogram
///
/// This reorders the leaves to minimize the sum of distances between adjacent leaves.
/// Uses automatic algorithm selection: exact for small dendrograms, heuristic for large ones.
///
/// # Arguments
///
/// * `z` - The linkage matrix
/// * `d` - The original distance matrix (condensed form)
///
/// # Returns
///
/// * `Result<Array1<usize>>` - The optimal leaf ordering
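///
/// # Examples
///
/// A minimal sketch; the `linkage` import path is an assumed re-export, so the example
/// is marked `ignore` rather than compiled as a doctest.
///
/// ```ignore
/// use scirs2_core::ndarray::{Array1, Array2};
/// // Assumed re-export paths
/// use scirs2_cluster::hierarchy::{linkage, optimal_leaf_ordering, LinkageMethod, Metric};
///
/// let data = Array2::from_shape_vec((4, 1), vec![0.0f64, 1.0, 10.0, 11.0]).unwrap();
/// let z = linkage(data.view(), LinkageMethod::Ward, Metric::Euclidean).unwrap();
///
/// // Condensed distances for the pairs (0,1), (0,2), (0,3), (1,2), (1,3), (2,3)
/// let d = Array1::from_vec(vec![1.0, 10.0, 11.0, 9.0, 10.0, 1.0]);
///
/// // A permutation of 0..4 that places similar leaves next to each other
/// let order = optimal_leaf_ordering(&z, &d).unwrap();
/// assert_eq!(order.len(), 4);
/// ```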
#[allow(dead_code)]
pub fn optimal_leaf_ordering<F: Float + FromPrimitive + PartialOrd + Debug>(
    z: &Array2<F>,
    d: &Array1<F>,
) -> Result<Array1<usize>> {
    // Use the implementation from the leaf_ordering module
    crate::hierarchy::leaf_ordering::optimal_leaf_ordering(z.view(), d.view())
}

/// Converts a linkage matrix to a list of merge tuples for dendrogram visualization
///
/// # Arguments
///
/// * `z` - The linkage matrix
///
/// # Returns
///
/// * `Result<Vec<(usize, usize, F, usize)>>` - One `(cluster1, cluster2, height, count)` tuple per merge
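///
/// # Examples
///
/// A minimal sketch; the `linkage` import path is an assumed re-export, so the example
/// is marked `ignore` rather than compiled as a doctest.
///
/// ```ignore
/// use scirs2_core::ndarray::Array2;
/// // Assumed re-export paths
/// use scirs2_cluster::hierarchy::{dendrogram, linkage, LinkageMethod, Metric};
///
/// let data = Array2::from_shape_vec((3, 1), vec![0.0f64, 5.0, 10.0]).unwrap();
/// let z = linkage(data.view(), LinkageMethod::Single, Metric::Euclidean).unwrap();
///
/// // Each entry describes one merge: (cluster1, cluster2, merge height, cluster size)
/// for (c1, c2, height, count) in dendrogram(&z).unwrap() {
///     println!("merge {} + {} at height {} -> {} points", c1, c2, height, count);
/// }
/// ```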
#[allow(dead_code)]
pub fn dendrogram<F: Float + FromPrimitive + Clone>(
    z: &Array2<F>,
) -> Result<Vec<(usize, usize, F, usize)>> {
    let n = z.shape()[0];

    if n == 0 {
        return Err(ClusteringError::InvalidInput("Empty linkage matrix".into()));
    }

    // Convert the linkage matrix to a list of tuples
    let mut result = Vec::with_capacity(n);

    for i in 0..n {
        let cluster1 = z[[i, 0]].to_usize().unwrap();
        let cluster2 = z[[i, 1]].to_usize().unwrap();
        let height = z[[i, 2]];
        let count = z[[i, 3]].to_usize().unwrap();

        result.push((cluster1, cluster2, height, count));
    }

    Ok(result)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::hierarchy::{linkage, LinkageMethod, Metric};
    use scirs2_core::ndarray::{Array1, Array2};

    #[test]
    fn test_cophenet_simple() {
        // Create simple test data with clear hierarchical structure
        let data = Array2::from_shape_vec(
            (4, 2),
            vec![
                0.0, 0.0, // Point 0
                1.0, 0.0, // Point 1 (close to 0)
                10.0, 0.0, // Point 2 (far from 0,1)
                11.0, 0.0, // Point 3 (close to 2)
            ],
        )
        .unwrap();

        // Compute linkage matrix
        let linkage_matrix =
            linkage(data.view(), LinkageMethod::Single, Metric::Euclidean).unwrap();

        // Compute original distances (condensed form)
        let mut original_distances = Array1::zeros(6); // C(4,2) = 6 pairwise distances
        let mut idx = 0;
        for i in 0..4 {
            for j in (i + 1)..4 {
                let dist = ((data[[i, 0]] - data[[j, 0]]).powi(2)
                    + (data[[i, 1]] - data[[j, 1]]).powi(2))
                .sqrt();
                original_distances[idx] = dist;
                idx += 1;
            }
        }

        // Compute cophenetic correlation
        let correlation = cophenet(&linkage_matrix, &original_distances).unwrap();

        // For a well-structured hierarchical dataset, correlation should be high
        assert!(
            correlation >= 0.5,
            "Cophenetic correlation should be reasonably high for structured data, got {}",
            correlation
        );
        assert!(
            correlation <= 1.0,
            "Cophenetic correlation cannot exceed 1.0, got {}",
            correlation
        );
    }

    #[test]
    fn test_cophenet_perfect_hierarchy() {
        // Create data with perfect hierarchical structure:
        // two well-separated clusters with internal hierarchy
        let data = Array2::from_shape_vec(
            (4, 1),
            vec![
                0.0,  // Cluster 1, point A
                1.0,  // Cluster 1, point B
                10.0, // Cluster 2, point C
                11.0, // Cluster 2, point D
            ],
        )
        .unwrap();

        let linkage_matrix =
            linkage(data.view(), LinkageMethod::Single, Metric::Euclidean).unwrap();

        // Compute original distances
        let original_distances = Array1::from_vec(vec![
            1.0,  // 0-1
            10.0, // 0-2
            11.0, // 0-3
            9.0,  // 1-2
            10.0, // 1-3
            1.0,  // 2-3
        ]);

        let correlation = cophenet(&linkage_matrix, &original_distances).unwrap();

        // Should have very high correlation for this structured data
        assert!(
            correlation >= 0.8,
            "Perfect hierarchy should have high cophenetic correlation, got {}",
            correlation
        );
    }

    #[test]
    fn test_cophenet_identical_points() {
        // Edge case: some identical points
        let data = Array2::from_shape_vec(
            (3, 2),
            vec![
                0.0, 0.0, // Point 0
                0.0, 0.0, // Point 1, identical to point 0
                5.0, 5.0, // Point 2
            ],
        )
        .unwrap();

        let linkage_matrix =
            linkage(data.view(), LinkageMethod::Single, Metric::Euclidean).unwrap();

        let original_distances = Array1::from_vec(vec![
            0.0,                           // 0-1 (identical)
            (5.0_f64.powi(2) * 2.0).sqrt(), // 0-2 = sqrt(50)
            (5.0_f64.powi(2) * 2.0).sqrt(), // 1-2 = sqrt(50)
        ]);

        // Should not panic and return a correlation result
        let result = cophenet(&linkage_matrix, &original_distances);
        assert!(
            result.is_ok(),
            "Cophenetic correlation should handle identical points"
        );

        // Any floating-point value (including NaN or infinity) is acceptable here;
        // the important part is that the computation completes without panicking.
        let _correlation = result.unwrap();
    }

    #[test]
    fn test_inconsistent_basic() {
        // Test inconsistency calculation
        let data = Array2::from_shape_vec(
            (5, 2),
            vec![0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 10.0, 0.0, 11.0, 0.0],
        )
        .unwrap();

        let linkage_matrix =
            linkage(data.view(), LinkageMethod::Average, Metric::Euclidean).unwrap();

        // Test with default depth
        let inconsistency_matrix = inconsistent(&linkage_matrix, None).unwrap();

        // Should have same number of rows as linkage matrix
        assert_eq!(inconsistency_matrix.shape()[0], linkage_matrix.shape()[0]);
        assert_eq!(inconsistency_matrix.shape()[1], 4); // [mean, std, count, inconsistency]

        // All values should be finite
        for i in 0..inconsistency_matrix.shape()[0] {
            for j in 0..inconsistency_matrix.shape()[1] {
                assert!(
                    inconsistency_matrix[[i, j]].is_finite(),
                    "Inconsistency values should be finite at [{}, {}]",
                    i,
                    j
                );
            }
        }
    }

    #[test]
    fn test_inconsistent_with_depth() {
        let data = Array2::from_shape_vec((4, 1), vec![0.0, 1.0, 2.0, 10.0]).unwrap();

        let linkage_matrix =
            linkage(data.view(), LinkageMethod::Complete, Metric::Euclidean).unwrap();

        // Test with different depths
        for depth in 1..=3 {
            let inconsistency_matrix = inconsistent(&linkage_matrix, Some(depth)).unwrap();

            assert_eq!(inconsistency_matrix.shape()[0], linkage_matrix.shape()[0]);
            assert_eq!(inconsistency_matrix.shape()[1], 4);

            // Count values should be positive
            for i in 0..inconsistency_matrix.shape()[0] {
                assert!(
                    inconsistency_matrix[[i, 2]] > 0.0,
                    "Count should be positive"
                );
            }
        }
    }

    #[test]
    fn test_optimal_leaf_ordering() {
        let data = Array2::from_shape_vec((4, 2), vec![1.0, 1.0, 2.0, 2.0, 10.0, 10.0, 11.0, 11.0])
            .unwrap();

        let linkage_matrix = linkage(data.view(), LinkageMethod::Ward, Metric::Euclidean).unwrap();
        // Condensed distances for the pairs (0,1), (0,2), (0,3), (1,2), (1,3), (2,3)
        let original_distances = Array1::from_vec(vec![1.414, 12.73, 14.14, 11.31, 12.73, 1.414]);

        let ordering = optimal_leaf_ordering(&linkage_matrix, &original_distances).unwrap();

        // Should return ordering for all samples
        assert_eq!(ordering.len(), 4);

        // All indices should be unique and in range
        let mut indices: Vec<usize> = ordering.to_vec();
        indices.sort();
        assert_eq!(indices, vec![0, 1, 2, 3]);
    }

    #[test]
    fn test_dendrogram_conversion() {
        let data = Array2::from_shape_vec((3, 1), vec![0.0, 5.0, 10.0]).unwrap();

        let linkage_matrix =
            linkage(data.view(), LinkageMethod::Single, Metric::Euclidean).unwrap();

        let dendrogram_data = dendrogram(&linkage_matrix).unwrap();

        // Should have n-1 entries for n samples
        assert_eq!(dendrogram_data.len(), 2);

        // Each entry should have the correct format: (cluster1, cluster2, height, count)
        for (i, &(cluster1, cluster2, height, count)) in dendrogram_data.iter().enumerate() {
            assert!(
                cluster1 < 2 * data.shape()[0] - 1,
                "Invalid cluster1 index in merge {}",
                i
            );
            assert!(
                cluster2 < 2 * data.shape()[0] - 1,
                "Invalid cluster2 index in merge {}",
                i
            );
            assert!(height >= 0.0, "Merge height should be non-negative");
            assert!(count >= 2, "Cluster count should be at least 2");
        }
    }

    #[test]
    fn test_cophenet_error_cases() {
        // Create valid test data first
        let data = Array2::from_shape_vec((3, 1), vec![0.0, 1.0, 2.0]).unwrap();
        let linkage_matrix =
            linkage(data.view(), LinkageMethod::Single, Metric::Euclidean).unwrap();

        // Test with zero variance (all distances identical)
        let identical_distances = Array1::from_vec(vec![1.0, 1.0, 1.0]);
        let result = cophenet(&linkage_matrix, &identical_distances);

        // Should handle edge cases gracefully
        if let Err(e) = result {
            // If it returns an error, it should be a meaningful one
            assert!(
                format!("{}", e).contains("variance") || format!("{}", e).contains("correlation")
            );
        } else {
            // If it succeeds, the result should be valid
            let correlation = result.unwrap();
            assert!(correlation.is_finite(), "Correlation should be finite");
        }
    }

    #[test]
    fn test_find_lca_height() {
        // Test the internal LCA function indirectly through cophenet
        let data = Array2::from_shape_vec((3, 1), vec![0.0, 1.0, 10.0]).unwrap();

        let linkage_matrix =
            linkage(data.view(), LinkageMethod::Single, Metric::Euclidean).unwrap();
        let original_distances = Array1::from_vec(vec![1.0, 10.0, 9.0]);

        // This should work without errors - testing the internal LCA logic
        let correlation = cophenet(&linkage_matrix, &original_distances).unwrap();
        assert!(
            correlation.is_finite(),
            "LCA height calculation should produce finite correlation"
        );
    }
}