1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
//! Property-based tests for clustering algorithm Python bindings
//!
//! This module contains property-based tests to ensure the robustness
//! and correctness of clustering algorithm implementations.
use proptest::prelude::*;
use scirs2_autograd::ndarray::{Array1, Array2};
use scirs2_core::random::thread_rng;
use std::collections::HashSet;
#[allow(non_snake_case)]
#[cfg(test)]
mod clustering_properties {
use super::*;
// Property: Cluster labels should be valid integers within expected range
proptest! {
#[test]
fn test_cluster_label_validity(
n_samples in 10..100usize,
n_clusters in 2..10usize,
) {
prop_assume!(n_clusters <= n_samples);
// Generate random labels as if from a clustering algorithm
let mut rng = thread_rng();
let labels: Vec<i32> = (0..n_samples)
.map(|_| (rng.random::<f64>() * n_clusters as f64).floor() as i32)
.collect();
// All labels should be non-negative
prop_assert!(labels.iter().all(|&label| label >= 0));
// Labels should not exceed the number of clusters
prop_assert!(labels.iter().all(|&label| label < n_clusters as i32));
// Should have the correct number of labels
prop_assert_eq!(labels.len(), n_samples);
}
}
// Property: Distance calculations should satisfy metric properties
proptest! {
#[test]
fn test_euclidean_distance_properties(
n_features in 1..10usize,
) {
let mut rng = thread_rng();
let point_a = Array1::from_shape_fn(n_features, |_| rng.random::<f64>());
let point_b = Array1::from_shape_fn(n_features, |_| rng.random::<f64>());
let point_c = Array1::from_shape_fn(n_features, |_| rng.random::<f64>());
// Calculate Euclidean distances
let dist_ab = (&point_a - &point_b).mapv(|x| x * x).sum().sqrt();
let dist_ba = (&point_b - &point_a).mapv(|x| x * x).sum().sqrt();
let dist_aa = (&point_a - &point_a).mapv(|x| x * x).sum().sqrt();
let dist_ac = (&point_a - &point_c).mapv(|x| x * x).sum().sqrt();
let dist_bc = (&point_b - &point_c).mapv(|x| x * x).sum().sqrt();
// Test metric properties
// 1. Non-negativity: d(a,b) >= 0
prop_assert!(dist_ab >= 0.0);
// 2. Identity: d(a,a) = 0
prop_assert!((dist_aa).abs() < 1e-10);
// 3. Symmetry: d(a,b) = d(b,a)
prop_assert!((dist_ab - dist_ba).abs() < 1e-10);
// 4. Triangle inequality: d(a,c) <= d(a,b) + d(b,c)
prop_assert!(dist_ac <= dist_ab + dist_bc + 1e-10); // Allow for floating point errors
}
}
// Property: K-means cluster assignments should be stable for identical inputs
proptest! {
#[test]
fn test_kmeans_determinism(
n_samples in 10..50usize,
n_features in 2..5usize,
n_clusters in 2..5usize,
) {
prop_assume!(n_clusters <= n_samples);
let mut rng = thread_rng();
let data = Array2::from_shape_fn((n_samples, n_features), |_| rng.random::<f64>());
// Generate cluster centers
let centers = Array2::from_shape_fn((n_clusters, n_features), |_| rng.random::<f64>());
// Assign each point to closest cluster (simplified K-means assignment)
let mut assignments1 = Vec::new();
let mut assignments2 = Vec::new();
for i in 0..n_samples {
let point = data.row(i);
let mut closest_cluster = 0;
let mut min_distance = f64::INFINITY;
for j in 0..n_clusters {
let center = centers.row(j);
let distance = (&point.to_owned() - ¢er.to_owned()).mapv(|x| x * x).sum().sqrt();
if distance < min_distance {
min_distance = distance;
closest_cluster = j;
}
}
assignments1.push(closest_cluster as i32);
assignments2.push(closest_cluster as i32);
}
// Assignments should be identical for identical inputs
prop_assert_eq!(assignments1, assignments2);
}
}
// Property: Number of unique clusters should not exceed the specified number
proptest! {
#[test]
fn test_cluster_count_validity(
n_samples in 10..100usize,
n_clusters in 2..10usize,
) {
prop_assume!(n_clusters <= n_samples);
let mut rng = thread_rng();
let labels: Vec<i32> = (0..n_samples)
.map(|_| (rng.random::<f64>() * n_clusters as f64).floor() as i32)
.collect();
let unique_labels: HashSet<i32> = labels.into_iter().collect();
// Number of unique clusters should not exceed n_clusters
prop_assert!(unique_labels.len() <= n_clusters);
// All unique labels should be non-negative
prop_assert!(unique_labels.iter().all(|&label| label >= 0));
}
}
// Property: Cluster centers should be meaningful for their assigned points
proptest! {
#[test]
fn test_cluster_center_properties(
n_features in 1..5usize,
cluster_size in 5..20usize,
) {
let mut rng = thread_rng();
// Generate points around a center
let true_center = Array1::from_shape_fn(n_features, |_| rng.random::<f64>());
let mut points = Vec::new();
for _ in 0..cluster_size {
let noise = Array1::from_shape_fn(n_features, |_| (rng.random::<f64>() - 0.5) * 0.1);
let point = &true_center + &noise;
points.push(point);
}
// Calculate empirical center (mean of points)
let mut empirical_center = Array1::zeros(n_features);
for point in &points {
empirical_center = &empirical_center + point;
}
empirical_center /= cluster_size as f64;
// Empirical center should be close to true center
let center_distance = (&empirical_center - &true_center).mapv(|x: f64| -> f64 { x * x }).sum().sqrt();
prop_assert!(center_distance < 1.0); // Should be reasonably close given small noise
}
}
// Property: DBSCAN noise points should be labeled as -1
proptest! {
#[test]
fn test_dbscan_noise_labeling(
n_samples in 10..50usize,
noise_ratio in 0.0f64..0.5,
) {
// Simulate DBSCAN labels with some noise points
let _rng = thread_rng();
let n_noise = (n_samples as f64 * noise_ratio) as usize;
let n_clustered = n_samples - n_noise;
let mut labels = Vec::new();
// Add clustered points (labels 0, 1, 2, ...)
for i in 0..n_clustered {
labels.push((i % 3) as i32); // Distribute among 3 clusters
}
// Add noise points (label -1)
for _ in 0..n_noise {
labels.push(-1);
}
let noise_count = labels.iter().filter(|&&label| label == -1).count();
let cluster_count = labels.iter().filter(|&&label| label >= 0).count();
prop_assert_eq!(noise_count, n_noise);
prop_assert_eq!(cluster_count, n_clustered);
prop_assert_eq!(noise_count + cluster_count, n_samples);
}
}
}
#[allow(non_snake_case)]
#[cfg(test)]
mod clustering_validation_properties {
use super::*;
// Property: Input data should have consistent dimensions
proptest! {
#[test]
fn test_clustering_input_validation(
n_samples in 5..100usize,
n_features in 1..10usize,
) {
let mut rng = thread_rng();
let data = Array2::from_shape_fn((n_samples, n_features), |_| rng.random::<f64>());
// Basic consistency checks
prop_assert_eq!(data.nrows(), n_samples);
prop_assert_eq!(data.ncols(), n_features);
prop_assert!(data.iter().all(|&val| val.is_finite()));
}
}
// Property: Invalid cluster numbers should be handled appropriately
proptest! {
#[test]
fn test_invalid_cluster_parameters(
n_samples in 5..20usize,
invalid_n_clusters in 0usize..1,
) {
// Test that invalid cluster numbers (0 or negative) are detected
// In practice, these should cause validation errors
prop_assert!(invalid_n_clusters < 1);
prop_assert!(n_samples >= 5); // Ensure we have sufficient samples
// The actual clustering algorithm should reject invalid_n_clusters
}
}
}