use scirs2_core::ndarray::{s, Array1, Array2, ArrayView2, ScalarOperand};
use scirs2_core::numeric::{Float, FromPrimitive};
use scirs2_linalg::eigh;
use std::fmt::Debug;
use crate::error::{ClusteringError, Result};
use crate::vq::{kmeans_with_options, KMeansInit, KMeansOptions};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AffinityMode {
NearestNeighbors,
RBF,
Precomputed,
}
#[allow(dead_code)]
fn eigengap_heuristic<F>(eigenvalues: &[F], max_clusters: usize) -> usize
where
F: Float + FromPrimitive + Debug + PartialOrd,
{
let n = eigenvalues.len();
let mut max_gap = F::zero();
let mut max_gap_idx = 1;
for i in 0..(max_clusters.min(n - 1)) {
let gap = eigenvalues[i + 1] - eigenvalues[i];
if gap > max_gap {
max_gap = gap;
max_gap_idx = i + 1;
}
}
max_gap_idx
}
#[allow(dead_code)]
fn normalized_laplacian<F>(affinity: &Array2<F>) -> Result<Array2<F>>
where
F: Float + FromPrimitive + Debug + PartialOrd,
{
let n = affinity.shape()[0];
if n != affinity.shape()[1] {
return Err(ClusteringError::InvalidInput(
"Affinity matrix must be square".to_string(),
));
}
let mut degrees = Array1::zeros(n);
for i in 0..n {
degrees[i] = affinity.row(i).sum();
}
let mut d_inv_sqrt = Array1::zeros(n);
for i in 0..n {
if degrees[i] <= F::epsilon() {
return Err(ClusteringError::ComputationError(
"Degree matrix contains zero values, graph may be disconnected".to_string(),
));
}
d_inv_sqrt[i] = F::one() / degrees[i].sqrt();
}
let mut laplacian = Array2::zeros((n, n));
for i in 0..n {
for j in 0..n {
if i == j {
laplacian[[i, j]] = F::one() - affinity[[i, j]] * d_inv_sqrt[i] * d_inv_sqrt[j];
} else {
laplacian[[i, j]] = -affinity[[i, j]] * d_inv_sqrt[i] * d_inv_sqrt[j];
}
}
}
Ok(laplacian)
}
#[allow(dead_code)]
fn knn_affinity<F>(data: ArrayView2<F>, n_neighbors: usize) -> Result<Array2<F>>
where
F: Float + FromPrimitive + Debug + PartialOrd,
{
let n_samples = data.shape()[0];
let n_features = data.shape()[1];
if n_neighbors >= n_samples {
return Err(ClusteringError::InvalidInput(format!(
"n_neighbors ({}) must be less than the number of samples ({})",
n_neighbors, n_samples
)));
}
let mut dist_matrix = Array2::zeros((n_samples, n_samples));
for i in 0..n_samples {
for j in (i + 1)..n_samples {
let mut dist_sq = F::zero();
for k in 0..n_features {
let diff = data[[i, k]] - data[[j, k]];
dist_sq = dist_sq + diff * diff;
}
let dist = dist_sq.sqrt();
dist_matrix[[i, j]] = dist;
dist_matrix[[j, i]] = dist; }
}
let mut affinity = Array2::zeros((n_samples, n_samples));
for i in 0..n_samples {
let mut distances: Vec<(usize, F)> = (0..n_samples)
.filter(|&j| i != j) .map(|j| (j, dist_matrix[[i, j]]))
.collect();
distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
for (j, _) in distances.iter().take(n_neighbors.min(distances.len())) {
affinity[[i, *j]] = F::one();
affinity[[*j, i]] = F::one();
}
}
Ok(affinity)
}
#[allow(dead_code)]
fn rbf_affinity<F>(data: ArrayView2<F>, gamma: F) -> Result<Array2<F>>
where
F: Float + FromPrimitive + Debug + PartialOrd,
{
let n_samples = data.shape()[0];
let n_features = data.shape()[1];
if gamma <= F::zero() {
return Err(ClusteringError::InvalidInput(
"gamma must be positive".to_string(),
));
}
let mut affinity = Array2::zeros((n_samples, n_samples));
for i in 0..n_samples {
affinity[[i, i]] = F::one();
for j in (i + 1)..n_samples {
let mut dist_sq = F::zero();
for k in 0..n_features {
let diff = data[[i, k]] - data[[j, k]];
dist_sq = dist_sq + diff * diff;
}
let affinity_val = (-gamma * dist_sq).exp();
affinity[[i, j]] = affinity_val;
affinity[[j, i]] = affinity_val; }
}
Ok(affinity)
}
#[derive(Debug, Clone)]
pub struct SpectralClusteringOptions<F: Float> {
pub affinity: AffinityMode,
pub n_neighbors: usize,
pub gamma: F,
pub normalized_laplacian: bool,
pub max_iter: usize,
pub n_init: usize,
pub tol: F,
pub random_seed: Option<u64>,
pub eigen_solver: String,
pub auto_n_clusters: bool,
}
impl<F: Float + FromPrimitive> Default for SpectralClusteringOptions<F> {
fn default() -> Self {
Self {
affinity: AffinityMode::RBF,
n_neighbors: 10,
gamma: F::from(1.0).expect("Failed to convert constant to float"),
normalized_laplacian: true,
max_iter: 300,
n_init: 10,
tol: F::from(1e-4).expect("Failed to convert constant to float"),
random_seed: None,
eigen_solver: "arpack".to_string(),
auto_n_clusters: false,
}
}
}
#[allow(dead_code)]
pub fn spectral_clustering<F>(
data: ArrayView2<F>,
n_clusters: usize,
options: Option<SpectralClusteringOptions<F>>,
) -> Result<(Array2<F>, Array1<usize>)>
where
F: Float
+ FromPrimitive
+ Debug
+ PartialOrd
+ ScalarOperand
+ 'static
+ std::iter::Sum
+ std::ops::AddAssign
+ std::ops::SubAssign
+ std::ops::MulAssign
+ std::ops::DivAssign
+ std::ops::RemAssign
+ std::fmt::Display
+ Send
+ Sync,
{
let opts = options.unwrap_or_default();
let n_samples = data.shape()[0];
scirs2_core::validation::clustering::validate_clustering_data(
&data,
"Spectral clustering",
false,
Some(2),
)
.map_err(|e| ClusteringError::InvalidInput(format!("Spectral clustering: {}", e)))?;
if n_clusters < 2 {
return Err(ClusteringError::InvalidInput(format!(
"Spectral clustering: number of _clusters must be >= 2, got {}",
n_clusters
)));
}
scirs2_core::validation::clustering::check_n_clusters_bounds(
&data,
n_clusters,
"Spectral clustering",
)
.map_err(|e| ClusteringError::InvalidInput(format!("{}", e)))?;
let affinity = match opts.affinity {
AffinityMode::NearestNeighbors => {
if data.shape()[0] == data.shape()[1] {
data.to_owned()
} else {
knn_affinity(data, opts.n_neighbors)?
}
}
AffinityMode::RBF => {
if data.shape()[0] == data.shape()[1] {
data.to_owned()
} else {
rbf_affinity(data, opts.gamma)?
}
}
AffinityMode::Precomputed => {
if data.shape()[0] != data.shape()[1] {
return Err(ClusteringError::InvalidInput(
"For precomputed affinity, data must be a square matrix".to_string(),
));
}
data.to_owned()
}
};
let laplacian = if opts.normalized_laplacian {
normalized_laplacian(&affinity)?
} else {
let mut lap = Array2::zeros((n_samples, n_samples));
let mut degrees = vec![F::zero(); n_samples];
for i in 0..n_samples {
degrees[i] = affinity.row(i).sum();
lap[[i, i]] = degrees[i];
}
for i in 0..n_samples {
for j in 0..n_samples {
lap[[i, j]] -= affinity[[i, j]];
}
}
lap
};
let n = laplacian.nrows();
let mut stabilized_laplacian = laplacian.clone();
for i in 0..n {
stabilized_laplacian[[i, i]] +=
F::from(1e-10).expect("Failed to convert constant to float");
}
let (eigenvalues, eigenvectors) = {
let (raw_vals, raw_vecs) = eigh(&stabilized_laplacian.view(), None)?;
let mut idx: Vec<usize> = (0..n).collect();
idx.sort_by(|&a, &b| {
raw_vals[a]
.partial_cmp(&raw_vals[b])
.unwrap_or(std::cmp::Ordering::Equal)
});
let mut sorted_vals = raw_vals.clone();
let mut sorted_vecs = raw_vecs.clone();
for (new, &old) in idx.iter().enumerate() {
sorted_vals[new] = raw_vals[old];
for row in 0..n {
sorted_vecs[[row, new]] = raw_vecs[[row, old]];
}
}
(sorted_vals, sorted_vecs)
};
let actual_n_clusters = if opts.auto_n_clusters {
eigengap_heuristic(&eigenvalues.to_vec(), n_clusters)
} else {
n_clusters
};
let embedding = if opts.normalized_laplacian {
eigenvectors.slice(s![.., ..actual_n_clusters]).to_owned()
} else {
eigenvectors
.slice(s![.., 1..(actual_n_clusters + 1)])
.to_owned()
};
let normalized_embedding = if opts.normalized_laplacian {
let mut norm_embedding = embedding.clone();
for i in 0..n_samples {
let row = embedding.row(i);
let norm: F = row.iter().map(|&x| x * x).sum::<F>().sqrt();
if norm > F::epsilon() {
for j in 0..actual_n_clusters {
norm_embedding[[i, j]] = embedding[[i, j]] / norm;
}
}
}
norm_embedding
} else {
embedding
};
let kmeans_opts = KMeansOptions {
max_iter: opts.max_iter,
tol: opts.tol,
random_seed: opts.random_seed,
n_init: opts.n_init,
init_method: KMeansInit::KMeansPlusPlus,
};
let (_, labels) = kmeans_with_options(
normalized_embedding.view(),
actual_n_clusters,
Some(kmeans_opts),
)?;
Ok((normalized_embedding, labels))
}
#[allow(dead_code)]
pub fn spectral_bipartition<F>(
data: ArrayView2<F>,
options: Option<SpectralClusteringOptions<F>>,
) -> Result<Array1<usize>>
where
F: Float
+ FromPrimitive
+ Debug
+ PartialOrd
+ ScalarOperand
+ 'static
+ std::iter::Sum
+ std::ops::AddAssign
+ std::ops::SubAssign
+ std::ops::MulAssign
+ std::ops::DivAssign
+ std::ops::RemAssign
+ std::fmt::Display
+ Send
+ Sync,
{
let (_, labels) = spectral_clustering(data, 2, options)?;
Ok(labels)
}
#[cfg(test)]
mod tests {
use super::*;
use scirs2_core::ndarray::Array2;
#[test]
fn test_spectral_clustering_basic() {
let data = Array2::from_shape_vec(
(6, 2),
vec![
1.0, 1.0, 1.1, 1.1, 0.9, 0.9, 5.0, 5.0, 5.1, 5.1, 4.9, 4.9,
],
)
.expect("Operation failed");
let options = SpectralClusteringOptions {
affinity: AffinityMode::RBF,
gamma: 0.1, normalized_laplacian: false, ..Default::default()
};
let result = spectral_clustering(data.view(), 2, Some(options));
assert!(
result.is_ok(),
"Spectral clustering should not fail: {:?}",
result.err()
);
let (embeddings, labels) = result.expect("Operation failed");
assert_eq!(embeddings.shape()[0], 6);
assert_eq!(labels.len(), 6);
let unique_labels: std::collections::HashSet<_> = labels.iter().cloned().collect();
assert!(
unique_labels.len() <= 2,
"Should have at most 2 clusters, got: {:?}",
unique_labels
);
assert!(
labels.iter().all(|&l| l == 0 || l == 1),
"All labels should be 0 or 1, got: {:?}",
labels
);
}
#[test]
fn test_spectral_clustering_ring() {
let data = Array2::from_shape_vec(
(16, 2),
vec![
1.0, 0.0, 0.7, 0.7, 0.0, 1.0, -0.7, 0.7, -1.0, 0.0, -0.7, -0.7, 0.0, -1.0, 0.7,
-0.7, 3.0, 0.0, 2.1, 2.1, 0.0, 3.0, -2.1, 2.1, -3.0, 0.0, -2.1, -2.1, 0.0, -3.0, 2.1,
-2.1,
],
)
.expect("Operation failed");
let options = SpectralClusteringOptions {
affinity: AffinityMode::RBF,
gamma: 0.05, n_init: 5, normalized_laplacian: false, ..Default::default()
};
let result = spectral_clustering(data.view(), 2, Some(options));
assert!(
result.is_ok(),
"Spectral clustering should not fail: {:?}",
result.err()
);
let (_, labels) = result.expect("Operation failed");
let unique_labels: std::collections::HashSet<_> = labels.iter().cloned().collect();
assert!(
unique_labels.len() <= 2,
"Should have at most 2 clusters, got: {:?}",
unique_labels
);
let first_label = labels[0];
let all_same = labels.iter().all(|&l| l == first_label);
assert!(!all_same, "All points should not be in the same cluster");
assert!(
labels.iter().all(|&l| l == 0 || l == 1),
"All labels should be 0 or 1"
);
}
#[test]
fn test_eigh_5x5_known_eigenvalues() {
let n = 5usize;
let v: f64 = 1.0 / (n as f64).sqrt();
let mut a = Array2::<f64>::zeros((n, n));
for i in 0..n {
for j in 0..n {
a[[i, j]] = v * v;
}
}
let eps = 1e-10_f64;
for i in 0..n {
a[[i, i]] += eps;
}
let (raw_vals, raw_vecs) =
eigh(&a.view(), None).expect("eigh must succeed on a valid SPSD matrix");
let mut idx: Vec<usize> = (0..n).collect();
idx.sort_by(|&i, &j| raw_vals[i].partial_cmp(&raw_vals[j]).unwrap());
let mut sorted_vals = raw_vals.clone();
let mut sorted_vecs = raw_vecs.clone();
for (new, &old) in idx.iter().enumerate() {
sorted_vals[new] = raw_vals[old];
for row in 0..n {
sorted_vecs[[row, new]] = raw_vecs[[row, old]];
}
}
for i in 0..4 {
assert!(
(sorted_vals[i] - eps).abs() < 1e-8,
"eigenvalue[{}] = {} should be ≈ {eps}",
i,
sorted_vals[i]
);
}
let expected_large = 1.0 + eps;
assert!(
(sorted_vals[4] - expected_large).abs() < 1e-8,
"eigenvalue[4] = {} should be ≈ {expected_large}",
sorted_vals[4]
);
for i in 0..n - 1 {
assert!(
sorted_vals[i] <= sorted_vals[i + 1] + 1e-14,
"eigenvalues not ascending at index {i}: {} > {}",
sorted_vals[i],
sorted_vals[i + 1]
);
}
}
#[test]
fn test_eigh_large_matrix_n_gt4_path() {
let data = Array2::<f64>::from_shape_vec(
(8, 2),
vec![
0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.1, 0.1, 50.0, 50.0, 50.1, 50.0, 50.0, 50.1, 50.1, 50.1, ],
)
.expect("shape is valid");
let opts = SpectralClusteringOptions {
affinity: AffinityMode::RBF,
gamma: 0.002,
normalized_laplacian: false,
n_init: 5,
random_seed: Some(42),
..Default::default()
};
let result = spectral_clustering(data.view(), 2, Some(opts));
assert!(
result.is_ok(),
"spectral_clustering on 8-point 2-D data must not fail: {:?}",
result.err()
);
let (embedding, labels) = result.expect("already checked is_ok");
assert_eq!(
embedding.shape(),
&[8, 2],
"embedding must be (8, 2), got {:?}",
embedding.shape()
);
assert_eq!(labels.len(), 8, "must have a label per sample");
assert!(
labels.iter().all(|&l| l < 2),
"all labels must be 0 or 1, got {:?}",
labels
);
let n0 = labels.iter().filter(|&&l| l == 0).count();
let n1 = labels.iter().filter(|&&l| l == 1).count();
assert!(
n0 > 0 && n1 > 0,
"both clusters must be non-empty, got n0={n0} n1={n1}"
);
let emb0: Vec<f64> = labels
.iter()
.enumerate()
.filter(|(_, &l)| l == 0)
.map(|(i, _)| embedding[[i, 0]])
.collect();
let emb1: Vec<f64> = labels
.iter()
.enumerate()
.filter(|(_, &l)| l == 1)
.map(|(i, _)| embedding[[i, 0]])
.collect();
let mean0 = emb0.iter().sum::<f64>() / emb0.len() as f64;
let mean1 = emb1.iter().sum::<f64>() / emb1.len() as f64;
let var0: f64 = emb0.iter().map(|x| (x - mean0).powi(2)).sum::<f64>() / emb0.len() as f64;
let var1: f64 = emb1.iter().map(|x| (x - mean1).powi(2)).sum::<f64>() / emb1.len() as f64;
let centroid_dist = (mean0 - mean1).abs();
let within_spread = (var0 + var1).sqrt();
assert!(
centroid_dist > within_spread,
"spectral embedding must separate groups: centroid_dist={centroid_dist:.4} must exceed within_spread={within_spread:.4}"
);
}
}