scirs2_cluster/
vq.rs

1//! Vector quantization functions
2//!
3//! This module provides vector quantization algorithms like K-means clustering
4//! and related utilities.
5//!
6//! ## Examples
7//!
8//! ```
9//! use scirs2_core::ndarray::{ArrayView1, Array2, ArrayView2};
10//! use scirs2_cluster::vq::kmeans;
11//!
12//! // Example data
13//! let data = Array2::from_shape_vec((6, 2), vec![
14//!     1.0, 2.0,
15//!     1.2, 1.8,
16//!     0.8, 1.9,
17//!     3.7, 4.2,
18//!     3.9, 3.9,
19//!     4.2, 4.1,
20//! ]).unwrap();
21//!
22//! // Run k-means with k=2
23//! let (centroids, labels) = kmeans(ArrayView2::from(&data), 2, None, None, None, None).unwrap();
24//!
25//! // Print the results
26//! println!("Centroids: {:?}", centroids);
27//! println!("Labels: {:?}", labels);
28//! ```
29
30use scirs2_core::ndarray::{s, Array1, Array2, ArrayView1, ArrayView2};
31use scirs2_core::numeric::{Float, FromPrimitive};
32use std::fmt::Debug;
33
34use crate::error::{ClusteringError, Result};
35
36mod distance_metrics;
37mod distance_simd;
38mod kmeans;
39mod kmeans2;
40mod minibatch_kmeans;
41mod parallel_kmeans;
42mod simd_kmeans;
43mod simd_optimizations;
44mod weighted_kmeans;
45pub use self::distance_metrics::{
46    create_metric, ChebyshevDistance, CorrelationDistance, CosineDistance,
47    DistanceMetric as VQDistanceMetric, EuclideanDistance, MahalanobisDistance, ManhattanDistance,
48    MetricType, MinkowskiDistance,
49};
50pub use distance_simd::{
51    distance_to_centroids_simd, pairwise_euclidean_parallel, pairwise_euclidean_simd,
52};
53pub use kmeans::{
54    kmeans, kmeans_init, kmeans_plus_plus, kmeans_with_metric, kmeans_with_options, KMeansInit,
55    KMeansOptions,
56};
57pub use kmeans2::{kmeans2, kmeans2_str, MinitMethod, MissingMethod};
58pub use minibatch_kmeans::*;
59pub use parallel_kmeans::{parallel_kmeans, ParallelKMeansOptions};
60pub use simd_kmeans::{kmeans_plus_plus_simd, kmeans_simd, mini_batch_kmeans_simd};
61pub use simd_optimizations::{
62    calculate_distortion_simd, compute_centroids_simd, euclidean_distance_simd, vq_simd,
63    whiten_simd, SimdOptimizationConfig,
64};
65pub use weighted_kmeans::{weighted_kmeans, weighted_kmeans_plus_plus, WeightedKMeansOptions};
66
67/// Computes the Euclidean distance between two vectors
68#[allow(dead_code)]
69pub fn euclidean_distance<F>(x: ArrayView1<F>, y: ArrayView1<F>) -> F
70where
71    F: Float + FromPrimitive,
72{
73    let mut sum = F::zero();
74    for i in 0..x.len() {
75        let diff = x[i] - y[i];
76        sum = sum + diff * diff;
77    }
78    sum.sqrt()
79}
80
81/// Normalize a group of observations on a per feature basis.
82///
83/// Before running k-means, it is beneficial to rescale each feature
84/// dimension of the observation set by its standard deviation (i.e. "whiten"
85/// it - as in "white noise" where each frequency has equal power).
86/// Each feature is divided by its standard deviation across all observations
87/// to give it unit variance.
88///
89/// # Arguments
90///
91/// * `obs` - Input data (n_samples × n_features)
92/// * `check_finite` - Whether to check that the input contains only finite values
93///
94/// # Returns
95///
96/// * Whitened array with the same shape as input
97///
98/// # Examples
99///
100/// ```
101/// use scirs2_core::ndarray::Array2;
102/// use scirs2_cluster::vq::whiten;
103///
104/// let data = Array2::<f64>::from_shape_vec((4, 2), vec![
105///     1.0, 2.0,
106///     1.5, 2.5,
107///     0.5, 1.5,
108///     2.0, 3.0,
109/// ]).unwrap();
110///
111/// let whitened = whiten(&data).unwrap();
112/// ```
113#[allow(dead_code)]
114pub fn whiten<F>(obs: &Array2<F>) -> Result<Array2<F>>
115where
116    F: Float + FromPrimitive + std::fmt::Debug,
117{
118    let n_samples = obs.shape()[0];
119    let n_features = obs.shape()[1];
120
121    // Calculate mean for each feature
122    let mut means = Array1::<F>::zeros(n_features);
123    for j in 0..n_features {
124        let mut sum = F::zero();
125        for i in 0..n_samples {
126            sum = sum + obs[[i, j]];
127        }
128        means[j] = sum / F::from(n_samples).unwrap();
129    }
130
131    // Calculate standard deviation for each feature
132    let mut stds = Array1::<F>::zeros(n_features);
133    for j in 0..n_features {
134        let mut sum = F::zero();
135        for i in 0..n_samples {
136            let diff = obs[[i, j]] - means[j];
137            sum = sum + diff * diff;
138        }
139        stds[j] = (sum / F::from(n_samples - 1).unwrap()).sqrt();
140
141        // Avoid division by zero
142        if stds[j] < F::from(1e-10).unwrap() {
143            stds[j] = F::one();
144        }
145    }
146
147    // Whiten the data (subtract mean and divide by std)
148    let mut whitened = Array2::<F>::zeros((n_samples, n_features));
149    for i in 0..n_samples {
150        for j in 0..n_features {
151            whitened[[i, j]] = (obs[[i, j]] - means[j]) / stds[j];
152        }
153    }
154
155    Ok(whitened)
156}
157
158/// Assign codes from a code book to observations.
159///
160/// Assigns a code from a code book to each observation. Each
161/// observation vector in the 'M' by 'N' `obs` array is compared with the
162/// centroids in the code book and assigned the code of the closest
163/// centroid.
164///
165/// # Arguments
166///
167/// * `data` - Input data (n_samples × n_features). Each row is an observation.
168/// * `centroids` - The code book (n_centroids x n_features). Each row is a centroid.
169///
170/// # Returns
171///
172/// * Tuple of (labels, distances) where:
173///   - labels: Array of shape (n_samples,) with cluster assignments
174///   - distances: Array of shape (n_samples,) with distances to the closest centroid
175///
176/// # Errors
177///
178/// * Returns an error if the dimensions of data and centroids don't match
179#[allow(dead_code)]
180pub fn vq<F>(data: ArrayView2<F>, centroids: ArrayView2<F>) -> Result<(Array1<usize>, Array1<F>)>
181where
182    F: Float + FromPrimitive + Debug,
183{
184    if data.shape()[1] != centroids.shape()[1] {
185        return Err(ClusteringError::InvalidInput(format!(
186            "Observation array and centroid array must have the same number of dimensions. Got {} and {}",
187            data.shape()[1], centroids.shape()[1]
188        )));
189    }
190
191    let n_samples = data.shape()[0];
192    let n_centroids = centroids.shape()[0];
193
194    let mut labels = Array1::zeros(n_samples);
195    let mut distances = Array1::zeros(n_samples);
196
197    for i in 0..n_samples {
198        let point = data.slice(s![i, ..]);
199        let mut min_dist = F::infinity();
200        let mut closest_centroid = 0;
201
202        for j in 0..n_centroids {
203            let centroid = centroids.slice(s![j, ..]);
204            let dist = euclidean_distance(point, centroid);
205
206            if dist < min_dist {
207                min_dist = dist;
208                closest_centroid = j;
209            }
210        }
211
212        labels[i] = closest_centroid;
213        distances[i] = min_dist;
214    }
215
216    Ok((labels, distances))
217}