aprender-core 0.50.0

Next-generation machine learning library in pure Rust
// =========================================================================
// FALSIFY-HC: Agglomerative (hierarchical) clustering contract (aprender cluster)
//
// Five-Whys (PMAT-354):
//   Why 1: aprender had no inline FALSIFY-HC-* tests for AgglomerativeClustering
//   Why 2: agglomerative tests exist but lack contract-mapped FALSIFY naming
//   Why 3: no YAML contract for hierarchical clustering yet
//   Why 4: aprender predates the inline FALSIFY convention
//   Why 5: Hierarchical clustering was "obviously correct" (textbook algorithm)
//
// References:
//   - Ward (1963) "Hierarchical grouping to optimize an objective function"
// =========================================================================

use super::*;
use crate::primitives::Matrix;
use crate::traits::UnsupervisedEstimator;

/// FALSIFY-HC-001: Labels length matches sample count
#[test]
fn falsify_hc_001_labels_length() {
    let data = Matrix::from_vec(
        6,
        2,
        vec![
            0.0, 0.0, 0.1, 0.1, 0.2, 0.0, 10.0, 10.0, 10.1, 10.1, 10.0, 10.2,
        ],
    )
    .expect("valid matrix");

    let mut hc = AgglomerativeClustering::new(2, Linkage::Average);
    hc.fit(&data).expect("fit succeeds");

    let labels = hc.labels();
    assert_eq!(
        labels.len(),
        6,
        "FALSIFIED HC-001: labels len={}, expected 6",
        labels.len()
    );
}

/// FALSIFY-HC-002: Number of distinct labels equals n_clusters
#[test]
fn falsify_hc_002_n_clusters_correct() {
    let data = Matrix::from_vec(
        6,
        2,
        vec![
            0.0, 0.0, 0.1, 0.1, 0.2, 0.0, 10.0, 10.0, 10.1, 10.1, 10.0, 10.2,
        ],
    )
    .expect("valid matrix");

    let mut hc = AgglomerativeClustering::new(2, Linkage::Ward);
    hc.fit(&data).expect("fit succeeds");

    let labels = hc.labels();
    let mut unique: Vec<usize> = labels.clone();
    unique.sort_unstable();
    unique.dedup();

    assert_eq!(
        unique.len(),
        2,
        "FALSIFIED HC-002: unique labels={}, expected 2",
        unique.len()
    );
}

/// FALSIFY-HC-003: Two well-separated clusters get distinct labels
#[test]
fn falsify_hc_003_distinct_clusters() {
    let data = Matrix::from_vec(
        6,
        2,
        vec![
            0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 100.0, 100.0, 100.1, 100.0, 100.0, 100.1,
        ],
    )
    .expect("valid matrix");

    let mut hc = AgglomerativeClustering::new(2, Linkage::Single);
    hc.fit(&data).expect("fit succeeds");

    let labels = hc.labels();
    // Cluster A points should share a label
    assert_eq!(
        labels[0], labels[1],
        "FALSIFIED HC-003: cluster A inconsistent"
    );
    assert_eq!(
        labels[1], labels[2],
        "FALSIFIED HC-003: cluster A inconsistent"
    );
    // Different clusters
    assert_ne!(
        labels[0], labels[3],
        "FALSIFIED HC-003: clusters have same label={}",
        labels[0]
    );
}

/// FALSIFY-HC-004 (PMAT-849): Ward linkage merge distance matches scipy/sklearn.
///
/// The Lance-Williams Ward inter-cluster distance is
///   d(A, B) = sqrt(2 * |A| * |B| / (|A| + |B|)) * ||centroid(A) - centroid(B)||_2
/// A prior bug used `(|A|*|B|/(|A|+|B|)) * ||..||` (no factor of 2, no outer sqrt),
/// which under-penalizes merging into larger clusters and flips which pair merges.
///
/// Reference dataset (verified vs scipy.cluster.hierarchy.linkage(method='ward')
/// and sklearn.cluster.AgglomerativeClustering(linkage='ward')):
///   X = [[5.5,7.2],[6.0,5.4],[4.2,6.5],[4.4,8.9],[9.6,3.8],[7.9,5.3]], n_clusters=2
/// scipy partition: {1,4,5} together and {0,2,3} together.
/// The buggy coefficient produced {0,1,2,3} together and {4,5} alone instead.
#[test]
fn falsify_hc_004_ward_matches_scipy() {
    let data = Matrix::from_vec(
        6,
        2,
        vec![5.5, 7.2, 6.0, 5.4, 4.2, 6.5, 4.4, 8.9, 9.6, 3.8, 7.9, 5.3],
    )
    .expect("valid matrix");

    let mut hc = AgglomerativeClustering::new(2, Linkage::Ward);
    hc.fit(&data).expect("fit succeeds");

    let labels = hc.labels();

    // Labels are permutation-invariant, so compare the induced partition.
    // scipy/sklearn ward groups point 1 with the high-x points {4, 5},
    // and points {0, 2, 3} together. Point 1 must NOT be with point 0.
    assert_eq!(
        labels[1], labels[4],
        "FALSIFIED HC-004: point 1 and point 4 must share a cluster (scipy ward), got {} vs {}",
        labels[1], labels[4]
    );
    assert_eq!(
        labels[4], labels[5],
        "FALSIFIED HC-004: point 4 and point 5 must share a cluster (scipy ward), got {} vs {}",
        labels[4], labels[5]
    );
    assert_ne!(
        labels[1], labels[0],
        "FALSIFIED HC-004: point 1 must NOT be with point 0 (scipy ward partition is {{1,4,5}} | {{0,2,3}}), both got {}",
        labels[1]
    );
    // The {0,2,3} side must be internally consistent.
    assert_eq!(
        labels[0], labels[2],
        "FALSIFIED HC-004: point 0 and point 2 must share a cluster (scipy ward), got {} vs {}",
        labels[0], labels[2]
    );
    assert_eq!(
        labels[2], labels[3],
        "FALSIFIED HC-004: point 2 and point 3 must share a cluster (scipy ward), got {} vs {}",
        labels[2], labels[3]
    );
}

mod hc_proptest_falsify {
    use super::*;
    use proptest::prelude::*;

    // FALSIFY-HC-001-prop: Labels length matches sample count for random sizes
    proptest! {
        #![proptest_config(ProptestConfig::with_cases(15))]

        #[test]
        fn falsify_hc_001_prop_labels_length(
            n in 4..=15usize,
            seed in 0..500u32,
        ) {
            let data: Vec<f32> = (0..n * 2)
                .map(|i| ((i as f32 + seed as f32) * 0.37).sin() * 10.0)
                .collect();
            let matrix = Matrix::from_vec(n, 2, data).expect("valid");
            let mut hc = AgglomerativeClustering::new(2, Linkage::Average);
            hc.fit(&matrix).expect("fit");

            prop_assert_eq!(
                hc.labels().len(),
                n,
                "FALSIFIED HC-001-prop: labels len {} != {}",
                hc.labels().len(), n
            );
        }
    }

    // FALSIFY-HC-002-prop: Number of distinct labels equals n_clusters
    proptest! {
        #![proptest_config(ProptestConfig::with_cases(15))]

        #[test]
        fn falsify_hc_002_prop_n_clusters(
            n in 6..=15usize,
            seed in 0..500u32,
        ) {
            let data: Vec<f32> = (0..n * 2)
                .map(|i| ((i as f32 + seed as f32) * 0.37).sin() * 10.0)
                .collect();
            let matrix = Matrix::from_vec(n, 2, data).expect("valid");
            let mut hc = AgglomerativeClustering::new(2, Linkage::Ward);
            hc.fit(&matrix).expect("fit");

            let labels = hc.labels();
            let mut unique: Vec<usize> = labels.clone();
            unique.sort_unstable();
            unique.dedup();

            prop_assert_eq!(
                unique.len(),
                2,
                "FALSIFIED HC-002-prop: unique labels {} != 2",
                unique.len()
            );
        }
    }
}