scirs2_cluster/spectral/mod.rs
1//! Spectral clustering implementation
2//!
3//! Spectral clustering uses the eigenvalues of a similarity matrix to reduce the
4//! dimensionality before clustering in fewer dimensions. This method is particularly
5//! useful when the clusters have complex shapes and KMeans would perform poorly.
6
7use scirs2_core::ndarray::{s, Array1, Array2, ArrayView2, ScalarOperand};
8use scirs2_core::numeric::{Float, FromPrimitive};
9use scirs2_linalg::{eigh, smallest_k_eigh};
10use std::fmt::Debug;
11
12use crate::error::{ClusteringError, Result};
13use crate::vq::{kmeans_with_options, KMeansInit, KMeansOptions};
14// use scirs2_core::validation::clustering::*;
15
16/// Affinity matrix construction methods
17#[derive(Debug, Clone, Copy, PartialEq, Eq)]
18pub enum AffinityMode {
19 /// Nearest neighbors connectivity
20 NearestNeighbors,
21 /// Gaussian similarity (RBF kernel)
22 RBF,
23 /// Precomputed affinity matrix
24 Precomputed,
25}
26
27/// Eigengap heuristic to estimate the number of clusters
28///
29/// This function implements the eigengap heuristic, which estimates
30/// the number of clusters based on the differences between consecutive
31/// eigenvalues.
32///
33/// # Arguments
34///
35/// * `eigenvalues` - Array of eigenvalues sorted in ascending order
36/// * `max_clusters` - Maximum number of clusters to consider
37///
38/// # Returns
39///
40/// * The estimated number of clusters
41#[allow(dead_code)]
42fn eigengap_heuristic<F>(eigenvalues: &[F], max_clusters: usize) -> usize
43where
44 F: Float + FromPrimitive + Debug + PartialOrd,
45{
46 // Find the largest eigengap among the first max_clusters eigenvalues
47 let n = eigenvalues.len();
48 let mut max_gap = F::zero();
49 let mut max_gap_idx = 1; // Default to 1 cluster
50
51 for i in 0..(max_clusters.min(n - 1)) {
52 let gap = eigenvalues[i + 1] - eigenvalues[i];
53 if gap > max_gap {
54 max_gap = gap;
55 max_gap_idx = i + 1;
56 }
57 }
58
59 max_gap_idx
60}
61
62/// Normalized graph Laplacian
63///
64/// This function computes the normalized graph Laplacian from an affinity matrix.
65/// The normalized Laplacian is defined as:
66/// L_norm = I - D^(-1/2) A D^(-1/2)
67/// where A is the affinity matrix and D is the diagonal matrix of degrees.
68///
69/// # Arguments
70///
71/// * `affinity` - Affinity matrix
72///
73/// # Returns
74///
75/// * The normalized graph Laplacian matrix
76#[allow(dead_code)]
77fn normalized_laplacian<F>(affinity: &Array2<F>) -> Result<Array2<F>>
78where
79 F: Float + FromPrimitive + Debug + PartialOrd,
80{
81 let n = affinity.shape()[0];
82 if n != affinity.shape()[1] {
83 return Err(ClusteringError::InvalidInput(
84 "Affinity matrix must be square".to_string(),
85 ));
86 }
87
88 // Calculate row sums (degrees)
89 let mut degrees = Array1::zeros(n);
90 for i in 0..n {
91 degrees[i] = affinity.row(i).sum();
92 }
93
94 // Calculate D^(-1/2)
95 let mut d_inv_sqrt = Array1::zeros(n);
96 for i in 0..n {
97 if degrees[i] <= F::epsilon() {
98 return Err(ClusteringError::ComputationError(
99 "Degree matrix contains zero values, graph may be disconnected".to_string(),
100 ));
101 }
102 d_inv_sqrt[i] = F::one() / degrees[i].sqrt();
103 }
104
105 // Calculate normalized Laplacian L_norm = I - D^(-1/2) A D^(-1/2)
106 let mut laplacian = Array2::zeros((n, n));
107
108 for i in 0..n {
109 for j in 0..n {
110 if i == j {
111 // Diagonal elements of identity matrix I minus the normalized _affinity
112 laplacian[[i, j]] = F::one() - affinity[[i, j]] * d_inv_sqrt[i] * d_inv_sqrt[j];
113 } else {
114 // Off-diagonal elements are the negative normalized _affinity
115 laplacian[[i, j]] = -affinity[[i, j]] * d_inv_sqrt[i] * d_inv_sqrt[j];
116 }
117 }
118 }
119
120 Ok(laplacian)
121}
122
123/// Create a K-nearest neighbor affinity matrix
124///
125/// # Arguments
126///
127/// * `data` - Input data
128/// * `n_neighbors` - Number of neighbors to consider for each point
129///
130/// # Returns
131///
132/// * Affinity matrix where each row has at most n_neighbors non-zero entries
133#[allow(dead_code)]
134fn knn_affinity<F>(data: ArrayView2<F>, n_neighbors: usize) -> Result<Array2<F>>
135where
136 F: Float + FromPrimitive + Debug + PartialOrd,
137{
138 let n_samples = data.shape()[0];
139 let n_features = data.shape()[1];
140
141 // Ensure n_neighbors is valid
142 if n_neighbors >= n_samples {
143 return Err(ClusteringError::InvalidInput(format!(
144 "n_neighbors ({}) must be less than the number of samples ({})",
145 n_neighbors, n_samples
146 )));
147 }
148
149 // Calculate pairwise distances
150 let mut dist_matrix = Array2::zeros((n_samples, n_samples));
151
152 for i in 0..n_samples {
153 for j in (i + 1)..n_samples {
154 let mut dist_sq = F::zero();
155 for k in 0..n_features {
156 let diff = data[[i, k]] - data[[j, k]];
157 dist_sq = dist_sq + diff * diff;
158 }
159 let dist = dist_sq.sqrt();
160
161 dist_matrix[[i, j]] = dist;
162 dist_matrix[[j, i]] = dist; // Symmetric
163 }
164 }
165
166 // Create KNN affinity matrix
167 let mut affinity = Array2::zeros((n_samples, n_samples));
168
169 for i in 0..n_samples {
170 // Get distances from point i to all other points
171 let mut distances: Vec<(usize, F)> = (0..n_samples)
172 .filter(|&j| i != j) // Exclude self
173 .map(|j| (j, dist_matrix[[i, j]]))
174 .collect();
175
176 // Sort by distance
177 distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
178
179 // Select k nearest neighbors
180 for (j, _) in distances.iter().take(n_neighbors.min(distances.len())) {
181 // Create binary adjacency matrix (1 for neighbors, 0 otherwise)
182 affinity[[i, *j]] = F::one();
183 // Make it symmetric
184 affinity[[*j, i]] = F::one();
185 }
186 }
187
188 Ok(affinity)
189}
190
191/// Create a RBF kernel affinity matrix
192///
193/// # Arguments
194///
195/// * `data` - Input data
196/// * `gamma` - RBF kernel parameter (1/(2*sigma^2))
197///
198/// # Returns
199///
200/// * Affinity matrix where each element (i,j) is exp(-gamma * ||x_i - x_j||^2)
201#[allow(dead_code)]
202fn rbf_affinity<F>(data: ArrayView2<F>, gamma: F) -> Result<Array2<F>>
203where
204 F: Float + FromPrimitive + Debug + PartialOrd,
205{
206 let n_samples = data.shape()[0];
207 let n_features = data.shape()[1];
208
209 if gamma <= F::zero() {
210 return Err(ClusteringError::InvalidInput(
211 "gamma must be positive".to_string(),
212 ));
213 }
214
215 // Calculate pairwise distances and apply RBF kernel
216 let mut affinity = Array2::zeros((n_samples, n_samples));
217
218 for i in 0..n_samples {
219 // Diagonal is 1 (distance to self is 0)
220 affinity[[i, i]] = F::one();
221
222 for j in (i + 1)..n_samples {
223 let mut dist_sq = F::zero();
224 for k in 0..n_features {
225 let diff = data[[i, k]] - data[[j, k]];
226 dist_sq = dist_sq + diff * diff;
227 }
228
229 // Apply RBF kernel: exp(-gamma * ||x_i - x_j||^2)
230 let affinity_val = (-gamma * dist_sq).exp();
231
232 affinity[[i, j]] = affinity_val;
233 affinity[[j, i]] = affinity_val; // Symmetric
234 }
235 }
236
237 Ok(affinity)
238}
239
240/// Options for spectral clustering
241#[derive(Debug, Clone)]
242pub struct SpectralClusteringOptions<F: Float> {
243 /// Method to build the affinity matrix
244 pub affinity: AffinityMode,
245
246 /// Number of neighbors for nearest neighbors affinity
247 pub n_neighbors: usize,
248
249 /// Parameter for RBF kernel (1/(2*sigma^2))
250 pub gamma: F,
251
252 /// Whether to use normalized graph Laplacian
253 pub normalized_laplacian: bool,
254
255 /// Maximum number of iterations for k-means
256 pub max_iter: usize,
257
258 /// Number of k-means initializations to run
259 pub n_init: usize,
260
261 /// Convergence threshold for k-means
262 pub tol: F,
263
264 /// Random seed for initialization
265 pub random_seed: Option<u64>,
266
267 /// Method for postprocessing eigenvectors
268 pub eigen_solver: String,
269
270 /// Whether to automatically detect number of clusters using eigengap heuristic
271 pub auto_n_clusters: bool,
272}
273
274impl<F: Float + FromPrimitive> Default for SpectralClusteringOptions<F> {
275 fn default() -> Self {
276 Self {
277 affinity: AffinityMode::RBF,
278 n_neighbors: 10,
279 gamma: F::from(1.0).unwrap(),
280 normalized_laplacian: true,
281 max_iter: 300,
282 n_init: 10,
283 tol: F::from(1e-4).unwrap(),
284 random_seed: None,
285 eigen_solver: "arpack".to_string(),
286 auto_n_clusters: false,
287 }
288 }
289}
290
291/// Spectral clustering
292///
293/// Spectral clustering uses the eigenvalues of a similarity matrix to perform
294/// dimensionality reduction before clustering in fewer dimensions.
295///
296/// # Arguments
297///
298/// * `data` - Input data or affinity matrix (n_samples × n_features) or (n_samples × n_samples)
299/// * `n_clusters` - Number of clusters to find
300/// * `options` - Optional parameters
301///
302/// # Returns
303///
304/// * Tuple of (embeddings, labels) where:
305/// - embeddings: Array of shape (n_samples × n_clusters) with spectral embeddings
306/// - labels: Array of shape (n_samples,) with cluster assignments
307///
308/// # Examples
309///
310/// ```
311/// use scirs2_core::ndarray::{Array2, ArrayView2};
312/// use scirs2_cluster::spectral::{spectral_clustering, SpectralClusteringOptions, AffinityMode};
313///
314/// // Example data with two ring-shaped clusters
315/// let data = Array2::from_shape_vec((20, 2), vec![
316/// // First ring
317/// 1.0, 0.0, 0.87, 0.5, 0.5, 0.87, 0.0, 1.0, -0.5, 0.87,
318/// -0.87, 0.5, -1.0, 0.0, -0.87, -0.5, -0.5, -0.87, 0.0, -1.0,
319/// // Second ring (larger radius)
320/// 4.0, 0.0, 3.46, 2.0, 2.0, 3.46, 0.0, 4.0, -2.0, 3.46,
321/// -3.46, 2.0, -4.0, 0.0, -3.46, -2.0, -2.0, -3.46, 0.0, -4.0,
322/// ]).unwrap();
323///
324/// // Run spectral clustering with RBF affinity
325/// let options = SpectralClusteringOptions {
326/// affinity: AffinityMode::RBF,
327/// gamma: 0.5, // Adjust based on the scale of your data
328/// ..Default::default()
329/// };
330///
331/// let (embeddings, labels) = spectral_clustering(data.view(), 2, Some(options)).unwrap();
332///
333/// // Print the results
334/// println!("Cluster assignments: {:?}", labels);
335/// ```
336#[allow(dead_code)]
337pub fn spectral_clustering<F>(
338 data: ArrayView2<F>,
339 n_clusters: usize,
340 options: Option<SpectralClusteringOptions<F>>,
341) -> Result<(Array2<F>, Array1<usize>)>
342where
343 F: Float
344 + FromPrimitive
345 + Debug
346 + PartialOrd
347 + ScalarOperand
348 + 'static
349 + std::iter::Sum
350 + std::ops::AddAssign
351 + std::ops::SubAssign
352 + std::ops::MulAssign
353 + std::ops::DivAssign
354 + std::ops::RemAssign
355 + std::fmt::Display
356 + Send
357 + Sync,
358{
359 let opts = options.unwrap_or_default();
360 let n_samples = data.shape()[0];
361
362 // Use unified validation
363 scirs2_core::validation::clustering::validate_clustering_data(
364 &data,
365 "Spectral clustering",
366 false,
367 Some(2),
368 )
369 .map_err(|e| ClusteringError::InvalidInput(format!("Spectral clustering: {}", e)))?;
370
371 // Spectral clustering requires at least 2 _clusters
372 if n_clusters < 2 {
373 return Err(ClusteringError::InvalidInput(format!(
374 "Spectral clustering: number of _clusters must be >= 2, got {}",
375 n_clusters
376 )));
377 }
378
379 scirs2_core::validation::clustering::check_n_clusters_bounds(
380 &data,
381 n_clusters,
382 "Spectral clustering",
383 )
384 .map_err(|e| ClusteringError::InvalidInput(format!("{}", e)))?;
385
386 // Step 1: Create the affinity matrix
387 let affinity = match opts.affinity {
388 AffinityMode::NearestNeighbors => {
389 // Check if data is a square matrix (precomputed affinity)
390 if data.shape()[0] == data.shape()[1] {
391 // Assuming it's already a precomputed affinity matrix
392 data.to_owned()
393 } else {
394 // Create KNN affinity matrix
395 knn_affinity(data, opts.n_neighbors)?
396 }
397 }
398 AffinityMode::RBF => {
399 // Check if data is a square matrix (precomputed affinity)
400 if data.shape()[0] == data.shape()[1] {
401 // Assuming it's already a precomputed affinity matrix
402 data.to_owned()
403 } else {
404 // Create RBF kernel affinity matrix
405 rbf_affinity(data, opts.gamma)?
406 }
407 }
408 AffinityMode::Precomputed => {
409 // Verify that data is a square matrix
410 if data.shape()[0] != data.shape()[1] {
411 return Err(ClusteringError::InvalidInput(
412 "For precomputed affinity, data must be a square matrix".to_string(),
413 ));
414 }
415 data.to_owned()
416 }
417 };
418
419 // Step 2: Compute the graph Laplacian
420 let laplacian = if opts.normalized_laplacian {
421 normalized_laplacian(&affinity)?
422 } else {
423 // Unnormalized Laplacian L = D - A
424 let mut lap = Array2::zeros((n_samples, n_samples));
425
426 // Calculate degrees (diagonal elements of D)
427 let mut degrees = vec![F::zero(); n_samples];
428 for i in 0..n_samples {
429 degrees[i] = affinity.row(i).sum();
430 lap[[i, i]] = degrees[i];
431 }
432
433 // Subtract affinity matrix: L = D - A
434 for i in 0..n_samples {
435 for j in 0..n_samples {
436 lap[[i, j]] -= affinity[[i, j]];
437 }
438 }
439
440 lap
441 };
442
443 // Step 3: Compute the eigenvalues and eigenvectors
444 // Ensure numerical stability by adding a small value to the diagonal
445 let n = laplacian.nrows();
446 let mut stabilized_laplacian = laplacian.clone();
447 for i in 0..n {
448 stabilized_laplacian[[i, i]] += F::from(1e-10).unwrap();
449 }
450
451 // Use the stabilized matrix for eigenvalue decomposition
452 // For larger matrices (>3x3), use smallest_k_eigh to avoid "not implemented" error
453 let (eigenvalues, eigenvectors) = if n <= 3 {
454 // For small matrices, use the full eigh decomposition
455 eigh(&stabilized_laplacian.view(), None)?
456 } else {
457 // For larger matrices, use specialized function to get smallest eigenvalues
458 // We need at least n_clusters eigenvalues, but request a few more for robustness
459 let k = (n_clusters + 2).min(n); // Request at least n_clusters eigenvalues
460 let max_iter = 1000;
461 let tolerance = F::from(1e-10).unwrap();
462
463 smallest_k_eigh(&stabilized_laplacian.view(), k, max_iter, tolerance)?
464 };
465
466 // Determine the actual number of _clusters
467 let actual_n_clusters = if opts.auto_n_clusters {
468 // Use eigengap heuristic to determine the number of _clusters
469 // When using the normalized Laplacian, we need the smaller eigenvalues
470 eigengap_heuristic(&eigenvalues.to_vec(), n_clusters)
471 } else {
472 n_clusters
473 };
474
475 // Step 4: Choose the appropriate eigenvectors
476 // For the normalized Laplacian, we take the eigenvectors corresponding to the smallest eigenvalues
477 let embedding = if opts.normalized_laplacian {
478 // Extract n_clusters eigenvectors corresponding to the smallest eigenvalues
479 // Note: eigenvalues should already be sorted in ascending order
480 eigenvectors.slice(s![.., ..actual_n_clusters]).to_owned()
481 } else {
482 // For the unnormalized Laplacian, we skip the constant eigenvector (smallest eigenvalue)
483 eigenvectors
484 .slice(s![.., 1..(actual_n_clusters + 1)])
485 .to_owned()
486 };
487
488 // Step 5: Row normalization (optional for some algorithms)
489 let normalized_embedding = if opts.normalized_laplacian {
490 // Normalize each row to have unit norm
491 let mut norm_embedding = embedding.clone();
492
493 for i in 0..n_samples {
494 let row = embedding.row(i);
495 let norm: F = row.iter().map(|&x| x * x).sum::<F>().sqrt();
496
497 if norm > F::epsilon() {
498 for j in 0..actual_n_clusters {
499 norm_embedding[[i, j]] = embedding[[i, j]] / norm;
500 }
501 }
502 }
503
504 norm_embedding
505 } else {
506 embedding
507 };
508
509 // Step 6: Apply k-means clustering in the embedding space
510 let kmeans_opts = KMeansOptions {
511 max_iter: opts.max_iter,
512 tol: opts.tol,
513 random_seed: opts.random_seed,
514 n_init: opts.n_init,
515 init_method: KMeansInit::KMeansPlusPlus,
516 };
517
518 let (_, labels) = kmeans_with_options(
519 normalized_embedding.view(),
520 actual_n_clusters,
521 Some(kmeans_opts),
522 )?;
523
524 Ok((normalized_embedding, labels))
525}
526
527/// Fit a spectral bipartitioning model
528///
529/// This function finds a 2-cluster solution by analyzing the second
530/// eigenvector of the graph Laplacian.
531///
532/// # Arguments
533///
534/// * `data` - Input data or affinity matrix (n_samples × n_features) or (n_samples × n_samples)
535/// * `options` - Optional parameters
536///
537/// # Returns
538///
539/// * Array of shape (n_samples,) with binary cluster assignments
540#[allow(dead_code)]
541pub fn spectral_bipartition<F>(
542 data: ArrayView2<F>,
543 options: Option<SpectralClusteringOptions<F>>,
544) -> Result<Array1<usize>>
545where
546 F: Float
547 + FromPrimitive
548 + Debug
549 + PartialOrd
550 + ScalarOperand
551 + 'static
552 + std::iter::Sum
553 + std::ops::AddAssign
554 + std::ops::SubAssign
555 + std::ops::MulAssign
556 + std::ops::DivAssign
557 + std::ops::RemAssign
558 + std::fmt::Display
559 + Send
560 + Sync,
561{
562 // Run spectral clustering with exactly 2 clusters
563 let (_, labels) = spectral_clustering(data, 2, options)?;
564 Ok(labels)
565}
566
567#[cfg(test)]
568mod tests {
569 use super::*;
570 use scirs2_core::ndarray::Array2;
571
572 #[test]
573 fn test_spectral_clustering_basic() {
574 // Create a dataset with 2 well-separated clusters
575 let data = Array2::from_shape_vec(
576 (6, 2),
577 vec![
578 // Cluster 1
579 1.0, 1.0, 1.1, 1.1, 0.9, 0.9, // Cluster 2
580 5.0, 5.0, 5.1, 5.1, 4.9, 4.9,
581 ],
582 )
583 .unwrap();
584
585 // Run spectral clustering with adjusted parameters
586 let options = SpectralClusteringOptions {
587 affinity: AffinityMode::RBF,
588 gamma: 0.1, // Smaller gamma for more global connectivity
589 normalized_laplacian: false, // Try unnormalized Laplacian first
590 ..Default::default()
591 };
592
593 let result = spectral_clustering(data.view(), 2, Some(options));
594
595 // Check if it doesn't panic/fail
596 assert!(
597 result.is_ok(),
598 "Spectral clustering should not fail: {:?}",
599 result.err()
600 );
601
602 let (embeddings, labels) = result.unwrap();
603
604 // Check dimensions
605 assert_eq!(embeddings.shape()[0], 6);
606 assert_eq!(labels.len(), 6);
607
608 // Check that we have at most 2 clusters (could be 1 if all merged)
609 let unique_labels: std::collections::HashSet<_> = labels.iter().cloned().collect();
610 assert!(
611 unique_labels.len() <= 2,
612 "Should have at most 2 clusters, got: {:?}",
613 unique_labels
614 );
615
616 // Check that all labels are valid (0 or 1)
617 assert!(
618 labels.iter().all(|&l| l == 0 || l == 1),
619 "All labels should be 0 or 1, got: {:?}",
620 labels
621 );
622 }
623
624 #[test]
625 fn test_spectral_clustering_ring() {
626 // Create two concentric ring-shaped clusters
627 // This is a more realistic test that validates spectral clustering works
628 let data = Array2::from_shape_vec(
629 (16, 2),
630 vec![
631 // First ring (8 points)
632 1.0, 0.0, 0.7, 0.7, 0.0, 1.0, -0.7, 0.7, -1.0, 0.0, -0.7, -0.7, 0.0, -1.0, 0.7,
633 -0.7, // Second ring (8 points)
634 3.0, 0.0, 2.1, 2.1, 0.0, 3.0, -2.1, 2.1, -3.0, 0.0, -2.1, -2.1, 0.0, -3.0, 2.1,
635 -2.1,
636 ],
637 )
638 .unwrap();
639
640 // K-means would fail on this dataset because the clusters are not linearly separable
641 // but spectral clustering can work with appropriate parameters
642
643 // Run spectral clustering with carefully tuned parameters
644 let options = SpectralClusteringOptions {
645 affinity: AffinityMode::RBF,
646 gamma: 0.05, // Smaller gamma for more global connectivity
647 n_init: 5, // Fewer initializations for testing
648 normalized_laplacian: false, // Sometimes unnormalized works better for rings
649 ..Default::default()
650 };
651
652 let result = spectral_clustering(data.view(), 2, Some(options));
653 assert!(
654 result.is_ok(),
655 "Spectral clustering should not fail: {:?}",
656 result.err()
657 );
658
659 let (_, labels) = result.unwrap();
660
661 // Check that we have at most 2 clusters
662 let unique_labels: std::collections::HashSet<_> = labels.iter().cloned().collect();
663 assert!(
664 unique_labels.len() <= 2,
665 "Should have at most 2 clusters, got: {:?}",
666 unique_labels
667 );
668
669 // Check that points are clustered (relaxed test since spectral clustering
670 // is sensitive to parameters and might not perfectly separate rings)
671 // Just check that not all points have the same label
672 let first_label = labels[0];
673 let all_same = labels.iter().all(|&l| l == first_label);
674 assert!(!all_same, "All points should not be in the same cluster");
675
676 // Verify all labels are valid (0 or 1)
677 assert!(
678 labels.iter().all(|&l| l == 0 || l == 1),
679 "All labels should be 0 or 1"
680 );
681 }
682}