//! Unsupervised learning and clustering algorithms for ToRSh
//!
//! This crate provides PyTorch-compatible clustering algorithms built on top of
//! the SciRS2 ecosystem, offering high-performance implementations of popular
//! clustering methods with extensive customization options.
//!
//! # Key Features
//!
//! - **K-Means Clustering**: Classic centroid-based clustering with multiple variants (Lloyd, Elkan, Mini-batch)
//! - **Gaussian Mixture Models**: Probabilistic clustering with EM algorithm (Full, Diagonal, Spherical covariance)
//! - **Spectral Clustering**: Graph-based clustering using eigendecomposition
//! - **DBSCAN**: Density-based clustering for arbitrary-shaped clusters with noise detection
//! - **HDBSCAN**: Hierarchical DBSCAN for varying density clusters
//! - **OPTICS**: Ordering Points To Identify the Clustering Structure with reachability plots
//! - **Hierarchical Clustering**: Agglomerative clustering with multiple linkage methods
//! - **Online K-Means**: Incremental clustering for streaming data with concept drift detection
//! - **Evaluation Metrics**: Comprehensive metrics including silhouette, ARI, NMI, Gap Statistic, and more
//!
//! # SciRS2 Integration
//!
//! All clustering algorithms are built on `scirs2-cluster` foundation:
//! - Leverages scirs2-core for random number generation and array operations
//! - Uses scirs2-stats for statistical computations
//! - Integrates with scirs2-metrics for clustering evaluation
//! - Employs scirs2-linalg for linear algebra operations
//!
//! # Example Usage
//!
//! ```rust
//! use torsh_cluster::prelude::*;
//! use torsh_tensor::creation::randn;
//!
//! // Create sample data
//! let data = randn::<f32>(&[100, 2])?;
//!
//! // Perform K-means clustering
//! let kmeans = KMeans::new(3)
//!     .max_iters(100)
//!     .tolerance(1e-4);
//!
//! let result = kmeans.fit(&data)?;
//! println!("Cluster centers: {:?}", result.centroids);
//! println!("Labels: {:?}", result.labels);
//! # Ok::<(), Box<dyn std::error::Error>>(())
//! ```
// Build without the standard library when the "std" feature is disabled;
// `alloc` is pulled in so heap-backed collections remain available in no_std.
#![cfg_attr(not(feature = "std"), no_std)]

#[cfg(not(feature = "std"))]
extern crate alloc;

52pub mod algorithms;
53pub mod error;
54pub mod evaluation;
55pub mod initialization;
56pub mod traits;
57pub mod utils;
58
59// Re-export core clustering algorithms
60pub use algorithms::{
61    dbscan::{DBSCANConfig, DBSCANResult, HDBSCANConfig, HDBSCANResult, DBSCAN, HDBSCAN},
62    gaussian_mixture::{GMConfig, GMResult, GaussianMixture},
63    hierarchical::{AgglomerativeClustering, HierarchicalResult, Linkage},
64    incremental::{
65        IncrementalClustering, OnlineKMeans, OnlineKMeansConfig, OnlineKMeansResult,
66        SlidingWindowConfig, SlidingWindowKMeans, SlidingWindowResult,
67    },
68    kmeans::{InitMethod, KMeans, KMeansAlgorithm, KMeansConfig, KMeansResult},
69    optics::{OPTICSConfig, OPTICSResult, OPTICS},
70    spectral::{SpectralClustering, SpectralConfig, SpectralResult},
71};
72
73// Re-export evaluation metrics
74pub use evaluation::{
75    metrics::{
76        adjusted_mutual_info_score, adjusted_rand_score, calinski_harabasz_score,
77        davies_bouldin_score, fowlkes_mallows_score, homogeneity_score,
78        normalized_mutual_info_score, silhouette_score, v_measure_score,
79    },
80    ClusteringMetric, EvaluationResult,
81};
82
83// Re-export initialization methods
84pub use initialization::{
85    forgy::Forgy, kmeans_plus_plus::KMeansPlusPlus, random_partition::RandomPartition,
86    InitializationStrategy,
87};
88
89// Re-export traits
90pub use traits::{ClusteringAlgorithm, ClusteringResult, Fit, FitPredict, Transform};
91
92// Re-export utilities
93pub use utils::{
94    adaptive::{suggest_dbscan_params, suggest_epsilon},
95    distance::{cosine_distance, euclidean_distance, manhattan_distance, DistanceMetric},
96    drift_detection::{CompositeDriftDetector, DriftStatus, PageHinkleyTest, ADWIN, DDM},
97    memory_efficient::{ChunkedDataProcessor, IncrementalCentroidUpdater, MemoryEfficientConfig},
98    preprocessing::{normalize_features, standardize_features, PreprocessingMethod},
99    validation::{validate_cluster_input, validate_n_clusters, ClusterValidation},
100};
101
102// Re-export error types
103pub use error::{ClusterError, ClusterResult};
104
105/// Version information
106pub const VERSION: &str = env!("CARGO_PKG_VERSION");
107pub const VERSION_MAJOR: u32 = 0;
108pub const VERSION_MINOR: u32 = 1;
109pub const VERSION_PATCH: u32 = 0;
110
111/// Prelude module for convenient imports
112pub mod prelude {
113
114    pub use crate::algorithms::{
115        dbscan::{DBSCANConfig, DBSCAN},
116        gaussian_mixture::{GMConfig, GaussianMixture},
117        hierarchical::{AgglomerativeClustering, Linkage},
118        kmeans::{InitMethod, KMeans, KMeansConfig},
119        spectral::{SpectralClustering, SpectralConfig},
120    };
121
122    pub use crate::evaluation::{
123        metrics::{adjusted_rand_score, silhouette_score},
124        ClusteringMetric,
125    };
126
127    pub use crate::initialization::{
128        Forgy, InitializationStrategy, KMeansPlusPlus, RandomPartition,
129    };
130
131    pub use crate::traits::{ClusteringAlgorithm, ClusteringResult, Fit, FitPredict, Transform};
132
133    pub use crate::utils::{
134        distance::{cosine_distance, euclidean_distance, DistanceMetric},
135        preprocessing::{normalize_features, standardize_features},
136    };
137
138    pub use crate::error::{ClusterError, ClusterResult};
139}
140
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;
    use torsh_tensor::Tensor;

    /// Sanity-check the hard-coded version constants against expected values.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert_eq!(VERSION_MAJOR, 0);
        assert_eq!(VERSION_MINOR, 1);
        assert_eq!(VERSION_PATCH, 0);
    }

    /// DBSCAN on two tight clusters plus one outlier: expect 2 clusters, 1 noise point.
    #[test]
    fn test_dbscan_basic() -> Result<(), Box<dyn std::error::Error>> {
        // Simple 2D dataset with clear clusters
        let data = Tensor::from_vec(
            vec![
                // Cluster 1 (around origin)
                0.0, 0.0, 0.1, 0.1, 0.0, 0.2, 0.2, 0.0, // Cluster 2 (around (5,5))
                5.0, 5.0, 5.1, 5.1, 5.0, 5.2, 5.2, 5.0,
                // Noise point (far from both clusters)
                10.0, 10.0,
            ],
            &[9, 2],
        )?;

        // eps = 0.5 comfortably covers intra-cluster spacing; min_samples = 2
        let dbscan = DBSCAN::new(0.5, 2);
        let result = dbscan.fit(&data)?;

        // Should find 2 clusters and 1 noise point
        assert_eq!(result.n_clusters, 2);
        assert_eq!(result.noise_points.len(), 1);
        assert!(!result.core_sample_indices.is_empty());

        Ok(())
    }

    /// Agglomerative clustering with n_clusters = 2 on two well-separated pairs.
    #[test]
    fn test_hierarchical_basic() -> Result<(), Box<dyn std::error::Error>> {
        // Simple 2D dataset
        let data = Tensor::from_vec(vec![0.0, 0.0, 0.1, 0.1, 5.0, 5.0, 5.1, 5.1], &[4, 2])?;

        let hierarchical = AgglomerativeClustering::new(2);
        let result = hierarchical.fit(&data)?;

        // Should find exactly 2 clusters
        assert_eq!(result.n_clusters, 2);

        // Check labels are valid (should be 0 and 1)
        let labels_vec = result.labels.to_vec()?;
        let unique_labels: std::collections::HashSet<i32> =
            labels_vec.iter().map(|&x| x as i32).collect();
        assert_eq!(unique_labels.len(), 2);

        Ok(())
    }

    /// Silhouette score of well-separated clusters should be strongly positive.
    #[test]
    fn test_silhouette_score_basic() -> Result<(), Box<dyn std::error::Error>> {
        // Simple well-separated clusters
        let data = Tensor::from_vec(
            vec![
                // Cluster 0
                0.0, 0.0, 0.1, 0.1, // Cluster 1
                10.0, 10.0, 10.1, 10.1,
            ],
            &[4, 2],
        )?;

        let labels = Tensor::from_vec(vec![0.0, 0.0, 1.0, 1.0], &[4])?;

        let score = silhouette_score(&data, &labels)?;

        // Well-separated clusters should have high silhouette score (close to 1.0)
        assert!(
            score > 0.5,
            "Silhouette score should be positive for well-separated clusters"
        );

        Ok(())
    }

    /// ARI of identical label assignments must equal 1.0.
    #[test]
    fn test_adjusted_rand_score_perfect() -> Result<(), Box<dyn std::error::Error>> {
        // Perfect clustering match
        let labels_true = Tensor::from_vec(vec![0.0, 0.0, 1.0, 1.0], &[4])?;
        let labels_pred = Tensor::from_vec(vec![0.0, 0.0, 1.0, 1.0], &[4])?;

        let score = adjusted_rand_score(&labels_true, &labels_pred)?;

        // Perfect match should give score of 1.0
        assert_relative_eq!(score, 1.0, epsilon = 1e-6);

        Ok(())
    }

    /// Calinski-Harabasz index (higher is better) on well-separated clusters.
    #[test]
    fn test_calinski_harabasz_score() -> Result<(), Box<dyn std::error::Error>> {
        // Well-separated clusters should have high CH score
        let data = Tensor::from_vec(
            vec![
                // Cluster 0
                0.0, 0.0, 0.1, 0.1, // Cluster 1
                5.0, 5.0, 5.1, 5.1,
            ],
            &[4, 2],
        )?;

        let labels = Tensor::from_vec(vec![0.0, 0.0, 1.0, 1.0], &[4])?;

        let score = calinski_harabasz_score(&data, &labels)?;

        // Well-separated clusters should have high CH score
        assert!(
            score > 1.0,
            "Calinski-Harabasz score should be > 1 for well-separated clusters"
        );

        Ok(())
    }

    /// Davies-Bouldin index (lower is better) on well-separated clusters.
    #[test]
    fn test_davies_bouldin_score() -> Result<(), Box<dyn std::error::Error>> {
        // Well-separated clusters should have low DB score
        let data = Tensor::from_vec(
            vec![
                // Cluster 0
                0.0, 0.0, 0.1, 0.1, // Cluster 1
                10.0, 10.0, 10.1, 10.1,
            ],
            &[4, 2],
        )?;

        let labels = Tensor::from_vec(vec![0.0, 0.0, 1.0, 1.0], &[4])?;

        let score = davies_bouldin_score(&data, &labels)?;

        // Well-separated clusters should have low DB score (closer to 0 is better)
        assert!(score >= 0.0, "Davies-Bouldin score should be non-negative");
        assert!(
            score < 5.0,
            "Davies-Bouldin score should be reasonable for well-separated clusters"
        );

        Ok(())
    }

    /// GMM with diagonal covariance on two tight clusters: check result shapes
    /// and that the information criteria are finite.
    #[test]
    fn test_gmm_basic() -> Result<(), Box<dyn std::error::Error>> {
        use algorithms::gaussian_mixture::{CovarianceType, GaussianMixture};

        // Simple 2D dataset with clear clusters
        let data = Tensor::from_vec(
            vec![
                // Cluster 0 (around origin)
                0.0, 0.0, 0.1, 0.1, -0.1, 0.1, 0.1, -0.1, // Cluster 1 (around (5,5))
                5.0, 5.0, 5.1, 5.1, 4.9, 5.1, 5.1, 4.9,
            ],
            &[8, 2],
        )?;

        let gmm = GaussianMixture::new(2)
            .covariance_type(CovarianceType::Diag)
            .max_iters(50)
            .tolerance(1e-3);

        let result = gmm.fit(&data)?;

        // Should find exactly 2 clusters
        assert_eq!(result.n_clusters(), 2);

        // Should have proper shapes
        assert_eq!(result.means.shape().dims(), &[2, 2]); // 2 components, 2 features
        assert_eq!(result.weights.shape().dims(), &[2]); // 2 components
        assert_eq!(result.labels.shape().dims(), &[8]); // 8 samples
        assert_eq!(result.responsibilities.shape().dims(), &[8, 2]); // 8 samples, 2 components

        // Log-likelihood should be reasonable (not negative infinity)
        assert!(result.log_likelihood > f64::NEG_INFINITY);

        // AIC and BIC should be finite
        assert!(result.aic.is_finite());
        assert!(result.bic.is_finite());

        // Number of iterations should be reasonable
        assert!(result.n_iter <= 50);

        println!(
            "GMM test passed - log_likelihood: {}, AIC: {}, BIC: {}, converged: {}",
            result.log_likelihood, result.aic, result.bic, result.converged
        );

        Ok(())
    }

    /// Spectral clustering with an RBF affinity on two tight clusters:
    /// check result shapes and that the spectral embedding succeeded.
    #[test]
    fn test_spectral_clustering_basic() -> Result<(), Box<dyn std::error::Error>> {
        use algorithms::spectral::{AffinityType, SpectralClustering};

        // Simple 2D dataset with clear clusters
        let data = Tensor::from_vec(
            vec![
                // Cluster 0 (around origin)
                0.0, 0.0, 0.1, 0.1, -0.1, 0.1, 0.1, -0.1, // Cluster 1 (around (3,3))
                3.0, 3.0, 3.1, 3.1, 2.9, 3.1, 3.1, 2.9,
            ],
            &[8, 2],
        )?;

        let spectral = SpectralClustering::new(2)
            .affinity(AffinityType::Rbf)
            .gamma(1.0);

        let result = spectral.fit(&data)?;

        // Should find exactly 2 clusters
        assert_eq!(result.n_clusters(), 2);

        // Should have proper shapes
        assert_eq!(result.labels.shape().dims(), &[8]); // 8 samples
        assert_eq!(result.affinity_matrix.shape().dims(), &[8, 8]); // 8x8 affinity matrix
        assert_eq!(result.embedding.shape().dims(), &[8, 2]); // 8 samples, 2 clusters embedding
        assert_eq!(result.eigenvalues.shape().dims(), &[8]); // 8 eigenvalues

        // Should have successfully created embedding
        assert!(result.embedding_success);

        // K-means iterations should be reasonable
        assert!(result.kmeans_iterations <= 100);

        println!(
            "Spectral clustering test passed - embedding_success: {}, kmeans_iterations: {}",
            result.embedding_success, result.kmeans_iterations
        );

        Ok(())
    }
}