1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
use crate::GeoFloat;
use rand::distr::weighted::Error;
/// Errors that can occur during k-means++ initialisation
#[derive(Debug, Clone, PartialEq)]
pub enum KMeansInitError {
/// Input data contains `NaN` coordinates
///
/// This typically indicates invalid or corrupted input data. Check your point coordinates
/// for NaN values before running k-means.
NaNCoordinate,
/// Input data contains infinite coordinates
///
/// This typically indicates invalid or corrupted input data. Check your point coordinates
/// for infinite values before running k-means.
InfiniteCoordinate,
/// All points are at identical locations, making clustering impossible
///
/// When all points have the same coordinates, k-means++ cannot select diverse initial
/// centroids because all distances are zero. This makes the weighted sampling distribution
/// degenerate.
///
/// Consider:
/// - Checking if your data has been properly loaded
/// - Adding small random perturbations to break ties if appropriate.
DegenerateData,
/// Failed to create weighted distribution for k-means++ sampling
///
/// This error occurs when the random weighted sampling distribution cannot be constructed,
/// typically because all weights sum to zero or contain invalid values.
WeightedDistributionFailed {
/// The underlying error from rand's WeightedIndex
error: Error,
},
}
impl std::fmt::Display for KMeansInitError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
KMeansInitError::NaNCoordinate => {
write!(f, "Input data contains NaN coordinates")
}
KMeansInitError::InfiniteCoordinate => {
write!(f, "Input data contains infinite coordinates")
}
KMeansInitError::DegenerateData => {
write!(
f,
"All points are at identical locations, making clustering impossible"
)
}
KMeansInitError::WeightedDistributionFailed { error } => {
write!(
f,
"Failed to create weighted distribution for k-means++ sampling: {}",
error
)
}
}
}
}
/// Errors that can occur during k-means clustering
#[derive(Debug, Clone, PartialEq)]
pub enum KMeansError<T: GeoFloat> {
/// Invalid number of clusters requested
InvalidK {
/// The requested number of clusters
k: usize,
/// The number of points in the dataset
n: usize,
},
/// An empty cluster was encountered and could not be recovered
EmptyCluster {
/// The iteration number when the error occurred
iteration: usize,
/// The cluster ID that became empty
cluster_id: usize,
},
/// Initialisation failed due to degenerate or invalid data
///
/// Contains specific information about the initialisation failure mode.
/// See [`KMeansInitError`] for details on each failure type.
InitializationFailed(KMeansInitError),
/// Maximum iterations reached without convergence
///
/// The algorithm completed `iterations` iterations without converging (no assignment
/// changes and centroid movement below tolerance). The cluster assignments from the
/// final iteration are included and may still be useful.
///
/// ## Interpreting the Results
///
/// Consider using the partial result if:
/// - `max_centroid_shift` is relatively small compared to your data scale
/// - `changed_assignments` is a small fraction of total points
/// - Visual inspection or validation metrics are acceptable
///
/// Consider increasing `max_iter` if:
/// - `max_centroid_shift` is large relative to tolerance
/// - A significant number of points are still changing clusters
/// - You require guaranteed convergence
MaxIterationsReached {
/// Cluster assignments from the final iteration.
///
/// These are valid assignments (each point assigned to nearest centroid)
/// but may improve with additional iterations.
assignments: Vec<usize>,
/// Number of iterations completed (equal to max_iter)
iterations: usize,
/// Maximum distance any centroid shifted in the final iteration
max_centroid_shift: T,
/// The tolerance threshold for convergence
tolerance: T,
/// Number of points that changed clusters in the final iteration
changed_assignments: usize,
},
}
impl<T: GeoFloat> std::fmt::Display for KMeansError<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
KMeansError::InvalidK { k, n } => {
write!(
f,
"Invalid k={}: must be > 0 and <= {} (number of points)",
k, n
)
}
KMeansError::EmptyCluster {
iteration,
cluster_id,
} => {
write!(
f,
"Empty cluster {} at iteration {} could not be recovered",
cluster_id, iteration
)
}
KMeansError::InitializationFailed(init_error) => {
write!(f, "K-means initialisation failed: {}", init_error)
}
KMeansError::MaxIterationsReached {
iterations,
max_centroid_shift,
tolerance,
changed_assignments,
..
} => {
let shift_f64 = max_centroid_shift
.to_f64()
.expect("max_centroid_shift must be representable as f64");
let tol_f64 = tolerance
.to_f64()
.expect("tolerance must be representable as f64");
write!(
f,
"k-means did not converge within {} iterations. \
Final centroid shift: {:.6} (tolerance: {:.6}), \
{} points changed clusters in final iteration. \
Consider increasing max_iter or using the partial result.",
iterations, shift_f64, tol_f64, changed_assignments
)
}
}
}
}
impl<T: GeoFloat> std::error::Error for KMeansError<T> {}