1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
//! Simple k-means clustering working on [`Matrix`] data.
//!
//! ```
//! use rustframe::compute::models::k_means::KMeans;
//! use rustframe::matrix::Matrix;
//!
//! let data = Matrix::from_vec(vec![1.0, 1.0, 5.0, 5.0], 2, 2);
//! let (model, labels) = KMeans::fit(&data, 2, 10, 1e-4);
//! assert_eq!(model.centroids.rows(), 2);
//! assert_eq!(labels.len(), 2);
//! ```
use crate::compute::stats::mean_vertical;
use crate::matrix::Matrix;
use crate::random::prelude::*;
pub struct KMeans {
pub centroids: Matrix<f64>, // (k, n_features)
}
impl KMeans {
/// Fit with k clusters.
pub fn fit(x: &Matrix<f64>, k: usize, max_iter: usize, tol: f64) -> (Self, Vec<usize>) {
let m = x.rows();
let n = x.cols();
assert!(k <= m, "k must be ≤ number of samples");
// ----- initialise centroids -----
let mut centroids = Matrix::zeros(k, n);
if k > 0 && m > 0 {
// case for empty data
if k == 1 {
let mean = mean_vertical(x);
centroids.row_copy_from_slice(0, &mean.data()); // ideally, data.row(0), but thats the same
} else {
// For k > 1, pick k distinct rows at random
let mut rng = rng();
let mut indices: Vec<usize> = (0..m).collect();
indices.shuffle(&mut rng);
for c in 0..k {
centroids.row_copy_from_slice(c, &x.row(indices[c]));
}
}
}
let mut labels = vec![0usize; m];
let mut distances = vec![0.0f64; m];
for _iter in 0..max_iter {
let mut changed = false;
// ----- assignment step -----
for i in 0..m {
let sample_row = x.row(i);
let mut best = 0usize;
let mut best_dist_sq = f64::MAX;
for c in 0..k {
let centroid_row = centroids.row(c);
let dist_sq: f64 = sample_row
.iter()
.zip(centroid_row.iter())
.map(|(a, b)| (a - b).powi(2))
.sum();
if dist_sq < best_dist_sq {
best_dist_sq = dist_sq;
best = c;
}
}
distances[i] = best_dist_sq;
if labels[i] != best {
labels[i] = best;
changed = true;
}
}
// ----- update step -----
let mut new_centroids = Matrix::zeros(k, n);
let mut counts = vec![0usize; k];
for i in 0..m {
let c = labels[i];
counts[c] += 1;
for j in 0..n {
new_centroids[(c, j)] += x[(i, j)];
}
}
for c in 0..k {
if counts[c] == 0 {
// This cluster is empty. Re-initialize its centroid to the point
// furthest from its assigned centroid to prevent the cluster from dying.
let mut furthest_point_idx = 0;
let mut max_dist_sq = 0.0;
for (i, &dist) in distances.iter().enumerate() {
if dist > max_dist_sq {
max_dist_sq = dist;
furthest_point_idx = i;
}
}
for j in 0..n {
new_centroids[(c, j)] = x[(furthest_point_idx, j)];
}
// Ensure this point isn't chosen again for another empty cluster in the same iteration.
if m > 0 {
distances[furthest_point_idx] = 0.0;
}
} else {
// Normalize the centroid by the number of points in it.
for j in 0..n {
new_centroids[(c, j)] /= counts[c] as f64;
}
}
}
// ----- convergence test -----
if !changed {
centroids = new_centroids; // update before breaking
break; // assignments stable
}
let diff = &new_centroids - ¢roids;
centroids = new_centroids; // Update for the next iteration
if tol > 0.0 {
let sq_diff = &diff * &diff;
let shift = sq_diff.data().iter().sum::<f64>().sqrt();
if shift < tol {
break;
}
}
}
(Self { centroids }, labels)
}
/// Predict nearest centroid for each sample.
pub fn predict(&self, x: &Matrix<f64>) -> Vec<usize> {
let m = x.rows();
let k = self.centroids.rows();
if m == 0 {
return Vec::new();
}
let mut labels = vec![0usize; m];
for i in 0..m {
let sample_row = x.row(i);
let mut best = 0usize;
let mut best_dist_sq = f64::MAX;
for c in 0..k {
let centroid_row = self.centroids.row(c);
let dist_sq: f64 = sample_row
.iter()
.zip(centroid_row.iter())
.map(|(a, b)| (a - b).powi(2))
.sum();
if dist_sq < best_dist_sq {
best_dist_sq = dist_sq;
best = c;
}
}
labels[i] = best;
}
labels
}
}
#[cfg(test)]
mod tests {
#[test]
fn test_k_means_empty_cluster_reinit_centroid() {
// Try multiple times to increase the chance of hitting the empty cluster case
for _ in 0..20 {
let data = vec![0.0, 0.0, 0.0, 0.0, 10.0, 10.0];
let x = FloatMatrix::from_rows_vec(data, 3, 2);
let k = 2;
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
// Check if any cluster is empty
let mut counts = vec![0; k];
for &label in &labels {
counts[label] += 1;
}
if counts.iter().any(|&c| c == 0) {
// Only check the property for clusters that are empty
let centroids = kmeans_model.centroids;
for c in 0..k {
if counts[c] == 0 {
let mut matches_data_point = false;
for i in 0..3 {
let dx = centroids[(c, 0)] - x[(i, 0)];
let dy = centroids[(c, 1)] - x[(i, 1)];
if dx.abs() < 1e-9 && dy.abs() < 1e-9 {
matches_data_point = true;
break;
}
}
// "Centroid {} (empty cluster) does not match any data point",c
assert!(matches_data_point);
}
}
break;
}
}
// If we never saw an empty cluster, that's fine; the test passes as long as no panic occurred
}
use super::*;
use crate::matrix::FloatMatrix;
fn create_test_data() -> (FloatMatrix, usize) {
// Simple 2D data for testing K-Means
// Cluster 1: (1,1), (1.5,1.5)
// Cluster 2: (5,8), (8,8), (6,7)
let data = vec![
1.0, 1.0, // Sample 0
1.5, 1.5, // Sample 1
5.0, 8.0, // Sample 2
8.0, 8.0, // Sample 3
6.0, 7.0, // Sample 4
];
let x = FloatMatrix::from_rows_vec(data, 5, 2);
let k = 2;
(x, k)
}
// Helper for single cluster test with exact mean
fn create_simple_integer_data() -> FloatMatrix {
// Data points: (1,1), (2,2), (3,3)
FloatMatrix::from_rows_vec(vec![1.0, 1.0, 2.0, 2.0, 3.0, 3.0], 3, 2)
}
#[test]
fn test_k_means_fit_predict_basic() {
let (x, k) = create_test_data();
let max_iter = 100;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
// Assertions for fit
assert_eq!(kmeans_model.centroids.rows(), k);
assert_eq!(kmeans_model.centroids.cols(), x.cols());
assert_eq!(labels.len(), x.rows());
// Check if labels are within expected range (0 to k-1)
for &label in &labels {
assert!(label < k);
}
// Predict with the same data
let predicted_labels = kmeans_model.predict(&x);
// The exact labels might vary due to random initialization,
// but the clustering should be consistent.
// We expect two clusters. Let's check if samples 0,1 are in one cluster
// and samples 2,3,4 are in another.
let cluster_0_members = vec![labels[0], labels[1]];
let cluster_1_members = vec![labels[2], labels[3], labels[4]];
// All members of cluster 0 should have the same label
assert_eq!(cluster_0_members[0], cluster_0_members[1]);
// All members of cluster 1 should have the same label
assert_eq!(cluster_1_members[0], cluster_1_members[1]);
assert_eq!(cluster_1_members[0], cluster_1_members[2]);
// The two clusters should have different labels
assert_ne!(cluster_0_members[0], cluster_1_members[0]);
// Check predicted labels are consistent with fitted labels
assert_eq!(labels, predicted_labels);
// Test with a new sample
let new_sample_data = vec![1.2, 1.3]; // Should be close to cluster 0
let new_sample = FloatMatrix::from_rows_vec(new_sample_data, 1, 2);
let new_sample_label = kmeans_model.predict(&new_sample)[0];
assert_eq!(new_sample_label, cluster_0_members[0]);
let new_sample_data_2 = vec![7.0, 7.5]; // Should be close to cluster 1
let new_sample_2 = FloatMatrix::from_rows_vec(new_sample_data_2, 1, 2);
let new_sample_label_2 = kmeans_model.predict(&new_sample_2)[0];
assert_eq!(new_sample_label_2, cluster_1_members[0]);
}
#[test]
fn test_k_means_fit_k_equals_m() {
// Test case where k (number of clusters) equals m (number of samples)
let (x, _) = create_test_data(); // 5 samples
let k = 5; // 5 clusters
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
assert_eq!(kmeans_model.centroids.rows(), k);
assert_eq!(labels.len(), x.rows());
// Each sample should be its own cluster. Due to random init, labels
// might not be [0,1,2,3,4] but will be a permutation of it.
let mut sorted_labels = labels.clone();
sorted_labels.sort_unstable();
sorted_labels.dedup();
// Labels should all be unique when k==m
assert_eq!(sorted_labels.len(), k);
}
#[test]
#[should_panic(expected = "k must be ≤ number of samples")]
fn test_k_means_fit_k_greater_than_m() {
let (x, _) = create_test_data(); // 5 samples
let k = 6; // k > m
let max_iter = 10;
let tol = 1e-6;
let (_kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol);
}
#[test]
fn test_k_means_fit_single_cluster() {
// Test with k=1
let x = create_simple_integer_data(); // Use integer data
let k = 1;
let max_iter = 100;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
assert_eq!(kmeans_model.centroids.rows(), 1);
assert_eq!(labels.len(), x.rows());
// All labels should be 0
assert!(labels.iter().all(|&l| l == 0));
// Centroid should be the mean of all data points
let expected_centroid_x = x.column(0).iter().sum::<f64>() / x.rows() as f64;
let expected_centroid_y = x.column(1).iter().sum::<f64>() / x.rows() as f64;
assert!((kmeans_model.centroids[(0, 0)] - expected_centroid_x).abs() < 1e-9);
assert!((kmeans_model.centroids[(0, 1)] - expected_centroid_y).abs() < 1e-9);
}
#[test]
fn test_k_means_predict_empty_matrix() {
let (x, k) = create_test_data();
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol);
// The `Matrix` type not support 0xN or Nx0 matrices.
// test with a 0x0 matrix is a valid edge case.
let empty_x = FloatMatrix::from_rows_vec(vec![], 0, 0);
let predicted_labels = kmeans_model.predict(&empty_x);
assert!(predicted_labels.is_empty());
}
#[test]
fn test_k_means_predict_single_sample() {
let (x, k) = create_test_data();
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol);
let single_sample = FloatMatrix::from_rows_vec(vec![1.1, 1.2], 1, 2);
let predicted_label = kmeans_model.predict(&single_sample);
assert_eq!(predicted_label.len(), 1);
assert!(predicted_label[0] < k);
}
}