import numpy as np
import pytest
from sklearn.cluster import KMeans as SKLearnKMeans
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.metrics import (
silhouette_score,
davies_bouldin_score,
calinski_harabasz_score,
)
import scirs2
class TestKMeans:
    """Checks of ``scirs2.KMeans`` against scikit-learn and basic invariants."""

    def test_kmeans_convergence(self):
        """Both implementations find two clusters with comparable inertia."""
        np.random.seed(42)
        blob_at_origin = np.random.randn(50, 2) + np.array([0, 0])
        blob_shifted = np.random.randn(50, 2) + np.array([5, 5])
        points = np.vstack([blob_at_origin, blob_shifted])

        ours = scirs2.KMeans(n_clusters=2, random_state=42, max_iter=300)
        ours.fit(points)

        reference = SKLearnKMeans(
            n_clusters=2,
            random_state=42,
            max_iter=300,
            n_init=1,
        )
        reference.fit(points)

        assert len(np.unique(ours.labels)) == 2
        assert len(np.unique(reference.labels_)) == 2

        # Inertia only has to agree within 20% relative to sklearn's value.
        relative_gap = np.abs(ours.inertia_ - reference.inertia_) / reference.inertia_
        assert relative_gap < 0.2

    def test_kmeans_properties(self):
        """Fitted attributes have the expected shapes and value ranges."""
        n_samples, n_features, n_clusters = 100, 4, 3

        np.random.seed(42)
        data = np.random.randn(n_samples, n_features)

        model = scirs2.KMeans(n_clusters=n_clusters)
        model.fit(data)

        assert model.cluster_centers_.shape == (n_clusters, n_features)
        assert len(model.labels) == n_samples
        assert model.labels.min() >= 0
        assert model.labels.max() < n_clusters
        assert model.inertia_ > 0

    @pytest.mark.skip(
        reason="Known issue: random seed interaction causes label permutation inconsistency"
    )
    def test_kmeans_fit_predict(self):
        """``fit_predict`` labels equal ``fit`` labels up to swapping 0 and 1."""
        np.random.seed(42)
        data = np.ascontiguousarray(np.random.randn(50, 3))

        one_shot = scirs2.KMeans(n_clusters=2, random_state=42)
        predicted = one_shot.fit_predict(data)

        two_step = scirs2.KMeans(n_clusters=2, random_state=42)
        two_step.fit(data)
        stored = two_step.labels

        # With two clusters the only possible relabelling is 0 <-> 1.
        same = np.array_equal(predicted, stored)
        swapped = np.array_equal(predicted, 1 - stored)
        assert same or swapped, "Labels should be either identical or permuted"
class TestClusteringMetrics:
    """Compare scirs2 clustering-quality metrics with scikit-learn's."""

    def test_silhouette_matches_sklearn(self):
        """Silhouette score agrees with sklearn on two labelled blobs."""
        np.random.seed(42)
        near_blob = np.random.randn(30, 2) + np.array([0, 0])
        far_blob = np.random.randn(30, 2) + np.array([10, 10])
        points = np.ascontiguousarray(np.vstack([near_blob, far_blob]))
        raw_labels = np.array([0] * 30 + [1] * 30, dtype=np.int32)
        assignment = np.ascontiguousarray(raw_labels)

        expected = silhouette_score(points, assignment)
        actual = scirs2.silhouette_score_py(points, assignment)

        assert np.allclose(expected, actual, rtol=1e-10)

    def test_davies_bouldin_matches_sklearn(self):
        """Davies-Bouldin score agrees with sklearn on two labelled blobs."""
        np.random.seed(42)
        near_blob = np.random.randn(30, 2)
        far_blob = np.random.randn(30, 2) + np.array([5, 5])
        points = np.ascontiguousarray(np.vstack([near_blob, far_blob]))
        raw_labels = np.array([0] * 30 + [1] * 30, dtype=np.int32)
        assignment = np.ascontiguousarray(raw_labels)

        expected = davies_bouldin_score(points, assignment)
        actual = scirs2.davies_bouldin_score_py(points, assignment)

        assert np.allclose(expected, actual, rtol=1e-10)

    def test_calinski_harabasz_matches_sklearn(self):
        """Calinski-Harabasz score agrees with sklearn on two labelled blobs."""
        np.random.seed(42)
        near_blob = np.random.randn(30, 2)
        far_blob = np.random.randn(30, 2) + np.array([8, 8])
        points = np.ascontiguousarray(np.vstack([near_blob, far_blob]))
        raw_labels = np.array([0] * 30 + [1] * 30, dtype=np.int32)
        assignment = np.ascontiguousarray(raw_labels)

        expected = calinski_harabasz_score(points, assignment)
        actual = scirs2.calinski_harabasz_score_py(points, assignment)

        assert np.allclose(expected, actual, rtol=1e-10)

    def test_metrics_on_kmeans_output(self):
        """Metrics computed on scirs2 KMeans labels fall in their valid ranges."""
        np.random.seed(42)
        data = np.random.randn(100, 4)

        model = scirs2.KMeans(n_clusters=3, random_state=42)
        model.fit(data)

        silhouette = scirs2.silhouette_score_py(data, model.labels)
        davies_bouldin = scirs2.davies_bouldin_score_py(data, model.labels)
        calinski_harabasz = scirs2.calinski_harabasz_score_py(data, model.labels)

        assert -1 <= silhouette <= 1
        assert davies_bouldin >= 0
        assert calinski_harabasz > 0
class TestPreprocessing:
    """Checks of scirs2 standardization and row normalization helpers."""

    @pytest.mark.skip(
        reason="Known difference: slightly different standardization algorithm"
    )
    def test_standardize_matches_sklearn(self):
        """``standardize_py`` output equals sklearn's ``StandardScaler`` output."""
        np.random.seed(42)
        data = np.ascontiguousarray(np.random.randn(100, 5) * 10 + 50)

        expected = StandardScaler().fit_transform(data)
        actual = scirs2.standardize_py(data, with_std=True)

        assert np.allclose(expected, actual, rtol=1e-10)

    def test_standardize_properties(self):
        """Standardized columns have zero mean and unit sample std (ddof=1)."""
        np.random.seed(42)
        data = np.ascontiguousarray(np.random.randn(1000, 10) * 5 + 20)

        standardized = scirs2.standardize_py(data, with_std=True)

        column_means = standardized.mean(axis=0)
        assert np.allclose(column_means, 0, atol=1e-10)
        column_stds = standardized.std(axis=0, ddof=1)
        assert np.allclose(column_stds, 1, atol=1e-10)

    def test_normalize_l2_matches_sklearn(self):
        """L2 ``normalize_py`` output equals sklearn's ``Normalizer`` output."""
        np.random.seed(42)
        data = np.random.randn(100, 5)

        expected = Normalizer(norm="l2").fit_transform(data)
        actual = scirs2.normalize_py(data, "l2")

        assert np.allclose(expected, actual, rtol=1e-10)

    def test_normalize_l2_properties(self):
        """Every row has unit Euclidean norm after L2 normalization."""
        np.random.seed(42)
        data = np.random.randn(100, 5)

        normalized = scirs2.normalize_py(data, "l2")

        euclidean_norms = np.linalg.norm(normalized, axis=1)
        assert np.allclose(euclidean_norms, 1.0, atol=1e-10)

    def test_normalize_l1_properties(self):
        """Absolute values of every row sum to one after L1 normalization."""
        np.random.seed(42)
        data = np.random.randn(100, 5)

        normalized = scirs2.normalize_py(data, "l1")

        absolute_sums = np.sum(np.abs(normalized), axis=1)
        assert np.allclose(absolute_sums, 1.0, atol=1e-10)
class TestEdgeCases:
    """Boundary configurations of ``scirs2.KMeans``."""

    def test_kmeans_single_cluster(self):
        """A single cluster yields one center and label 0 for every sample."""
        n_features = 3

        np.random.seed(42)
        data = np.random.randn(50, n_features)

        model = scirs2.KMeans(n_clusters=1)
        model.fit(data)

        assert model.cluster_centers_.shape == (1, n_features)
        assert np.all(model.labels == 0)

    def test_kmeans_k_equals_n(self):
        """With as many clusters as samples, no more than 10 labels appear."""
        n_points = 10

        np.random.seed(42)
        data = np.random.randn(n_points, 2)

        # A single iteration is requested; only the label count is checked.
        model = scirs2.KMeans(n_clusters=n_points, max_iter=1)
        model.fit(data)

        distinct_labels = np.unique(model.labels)
        assert len(distinct_labels) <= n_points
if __name__ == "__main__":
    # pytest.main() returns the session's exit code instead of exiting the
    # interpreter. Propagate it so that running this file directly ends with
    # a non-zero status when any test fails, rather than always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))