1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
#![cfg_attr(coverage_nightly, coverage(off))]
// Clustering engine implementation for Code Embeddings
// PMAT-SEARCH-007: K-means, Hierarchical, and DBSCAN clustering
use super::super::TursoVectorDB;
use super::types::{
ClusterFilters, ClusterResult, ClusteringMethod, Dendrogram, DendrogramMerge, Linkage,
};
use aprender::prelude::*;
use std::collections::HashMap;
use std::sync::Arc;
/// Clustering engine
pub struct ClusteringEngine {
#[allow(dead_code)] // Reserved for future clustering Phase 2 integration
vector_db: Arc<TursoVectorDB>,
}
impl ClusteringEngine {
/// Create new clustering engine
pub fn new(vector_db: Arc<TursoVectorDB>) -> Self {
Self { vector_db }
}
/// Perform clustering
pub async fn cluster(
&self,
method: ClusteringMethod,
_filters: ClusterFilters,
) -> Result<ClusterResult, String> {
// For now, return empty result
let method_name = match method {
ClusteringMethod::KMeans { .. } => "kmeans",
ClusteringMethod::Hierarchical { .. } => "hierarchical",
ClusteringMethod::DBSCAN { .. } => "dbscan",
};
Ok(ClusterResult {
method: method_name.to_string(),
clusters: Vec::new(),
outliers: Vec::new(),
silhouette_score: 0.0,
total_chunks: 0,
})
}
/// Convert Vec<Vec<f32>> to aprender Matrix
/// Helper for Phase 2 migration to aprender
pub(crate) fn vectors_to_matrix(vectors: &[Vec<f32>]) -> Result<Matrix<f32>, String> {
if vectors.is_empty() {
return Err("Cannot convert empty vector set".to_string());
}
let rows = vectors.len();
let cols = vectors[0].len();
// Flatten to 1D vector for Matrix::from_vec
let data: Vec<f32> = vectors.iter().flat_map(|v| v.iter().copied()).collect();
Matrix::from_vec(rows, cols, data).map_err(|e| format!("Matrix conversion error: {e:?}"))
}
/// K-means clustering implementation
///
/// # Arguments
/// * `vectors` - Array of vectors to cluster
/// * `k` - Number of clusters
/// * `max_iterations` - Maximum iterations before stopping
///
/// # Returns
/// Array of cluster labels (0 to k-1)
pub fn kmeans(
&self,
vectors: &[Vec<f32>],
k: usize,
max_iterations: usize,
) -> Result<Vec<usize>, String> {
// Phase 2: Use aprender for KMeans clustering (replaced custom implementation)
if vectors.is_empty() {
return Err("Cannot cluster empty vector set".to_string());
}
if k == 0 {
return Err("k must be greater than 0".to_string());
}
if k > vectors.len() {
return Err("Cannot have more clusters than points".to_string());
}
// Special case: single cluster
if k == 1 {
return Ok(vec![0; vectors.len()]);
}
// Convert to aprender Matrix
let matrix = Self::vectors_to_matrix(vectors)?;
// Use aprender KMeans
let mut kmeans = KMeans::new(k).with_max_iter(max_iterations);
kmeans
.fit(&matrix)
.map_err(|e| format!("KMeans fit error: {e:?}"))?;
let labels = kmeans.predict(&matrix);
Ok(labels)
}
/// K-means with seed for deterministic results
pub fn kmeans_with_seed(
&self,
vectors: &[Vec<f32>],
k: usize,
max_iterations: usize,
seed: u64,
) -> Result<Vec<usize>, String> {
// Phase 2: Use aprender with seed for deterministic results
if vectors.is_empty() {
return Err("Cannot cluster empty vector set".to_string());
}
if k == 0 {
return Err("k must be greater than 0".to_string());
}
if k > vectors.len() {
return Err("Cannot have more clusters than points".to_string());
}
// Special case: single cluster
if k == 1 {
return Ok(vec![0; vectors.len()]);
}
// Convert to aprender Matrix
let matrix = Self::vectors_to_matrix(vectors)?;
// Use aprender KMeans with seed
let mut kmeans = KMeans::new(k)
.with_max_iter(max_iterations)
.with_random_state(seed);
kmeans
.fit(&matrix)
.map_err(|e| format!("KMeans fit error: {e:?}"))?;
let labels = kmeans.predict(&matrix);
Ok(labels)
}
/// Hierarchical clustering
///
/// Note: This uses a custom implementation (not aprender) because it returns
/// a Dendrogram structure showing the merge history, which is useful for
/// visualization. aprender's HierarchicalClustering returns cluster labels only.
pub fn hierarchical(
&self,
vectors: &[Vec<f32>],
linkage: Linkage,
) -> Result<Dendrogram, String> {
if vectors.is_empty() {
return Err("Cannot cluster empty vector set".to_string());
}
let n = vectors.len();
// Guard: O(n²) distance matrix + O(n³) agglomerative merge
// Cap at 5000 to prevent memory exhaustion and timeouts (PMAT-505)
const MAX_HIERARCHICAL_SIZE: usize = 5000;
if n > MAX_HIERARCHICAL_SIZE {
return Err(format!(
"Hierarchical clustering input too large: {n} vectors (max: {MAX_HIERARCHICAL_SIZE}). \
Use k-means for large datasets."
));
}
let mut merges = Vec::new();
let mut clusters: Vec<Vec<usize>> = (0..n).map(|i| vec![i]).collect();
// Compute initial distance matrix
let mut distances = HashMap::new();
for i in 0..n {
for j in (i + 1)..n {
let dist = self.euclidean_distance(&vectors[i], &vectors[j]);
distances.insert((i, j), dist);
}
}
// Agglomerative clustering
while clusters.len() > 1 {
// Find closest pair
let mut min_dist = f64::MAX;
let mut min_i = 0;
let mut min_j = 1;
for i in 0..clusters.len() {
for j in (i + 1)..clusters.len() {
let dist = self.cluster_distance(
&clusters[i],
&clusters[j],
&distances,
vectors,
linkage,
);
if dist < min_dist {
min_dist = dist;
min_i = i;
min_j = j;
}
}
}
// Merge clusters
merges.push(DendrogramMerge {
cluster1: min_i,
cluster2: min_j,
distance: min_dist,
});
let mut merged = clusters[min_i].clone();
merged.extend(&clusters[min_j]);
// Remove merged clusters and add new one
clusters.remove(min_j);
clusters.remove(min_i);
clusters.push(merged);
}
Ok(Dendrogram { merges })
}
/// Compute distance between two clusters
pub(crate) fn cluster_distance(
&self,
cluster1: &[usize],
cluster2: &[usize],
distances: &HashMap<(usize, usize), f64>,
_vectors: &[Vec<f32>],
linkage: Linkage,
) -> f64 {
let mut dists = Vec::new();
for &i in cluster1 {
for &j in cluster2 {
let key = if i < j { (i, j) } else { (j, i) };
if let Some(&dist) = distances.get(&key) {
dists.push(dist);
}
}
}
if dists.is_empty() {
return f64::MAX;
}
match linkage {
Linkage::Single => *dists
.iter()
.min_by(|a, b| a.total_cmp(b))
.expect("internal error"),
Linkage::Complete => *dists
.iter()
.max_by(|a, b| a.total_cmp(b))
.expect("internal error"),
Linkage::Average => dists.iter().sum::<f64>() / dists.len() as f64,
}
}
/// DBSCAN clustering
///
/// # Arguments
/// * `vectors` - Array of vectors to cluster
/// * `epsilon` - Maximum distance for neighborhood
/// * `min_samples` - Minimum points for core point
///
/// # Returns
/// Array of cluster labels (-1 for noise, 0+ for clusters)
pub fn dbscan(
&self,
vectors: &[Vec<f32>],
epsilon: f64,
min_samples: usize,
) -> Result<Vec<i32>, String> {
// Phase 2: Use aprender for DBSCAN clustering (replaced custom implementation)
if vectors.is_empty() {
return Err("Cannot cluster empty vector set".to_string());
}
// Convert to aprender Matrix
let matrix = Self::vectors_to_matrix(vectors)?;
// Use aprender DBSCAN (cast epsilon to f32 for aprender API)
let mut dbscan = DBSCAN::new(epsilon as f32, min_samples);
dbscan
.fit(&matrix)
.map_err(|e| format!("DBSCAN fit error: {e:?}"))?;
let labels = dbscan.predict(&matrix);
Ok(labels)
}
/// Compute silhouette score for clustering quality
pub fn compute_silhouette_score(&self, vectors: &[Vec<f32>], labels: &[usize]) -> f64 {
if vectors.is_empty() || labels.is_empty() {
return 0.0;
}
let n = vectors.len();
let mut silhouette_sum = 0.0;
for i in 0..n {
let a = self.intra_cluster_distance(vectors, labels, i);
let b = self.nearest_cluster_distance(vectors, labels, i);
let silhouette = if a < b {
1.0 - (a / b)
} else if a > b {
(b / a) - 1.0
} else {
0.0
};
silhouette_sum += silhouette;
}
silhouette_sum / n as f64
}
/// Average distance to points in same cluster
pub(crate) fn intra_cluster_distance(
&self,
vectors: &[Vec<f32>],
labels: &[usize],
point_idx: usize,
) -> f64 {
let cluster_label = labels[point_idx];
let mut sum = 0.0;
let mut count = 0;
for (i, &label) in labels.iter().enumerate() {
if label == cluster_label && i != point_idx {
sum += self.euclidean_distance(&vectors[point_idx], &vectors[i]);
count += 1;
}
}
if count == 0 {
0.0
} else {
sum / count as f64
}
}
/// Average distance to nearest cluster
pub(crate) fn nearest_cluster_distance(
&self,
vectors: &[Vec<f32>],
labels: &[usize],
point_idx: usize,
) -> f64 {
let current_cluster = labels[point_idx];
let mut min_avg_dist = f64::MAX;
// Find all unique clusters
let mut clusters: Vec<usize> = labels.to_vec();
clusters.sort();
clusters.dedup();
for &cluster_label in &clusters {
if cluster_label == current_cluster {
continue;
}
let mut sum = 0.0;
let mut count = 0;
for (i, &label) in labels.iter().enumerate() {
if label == cluster_label {
sum += self.euclidean_distance(&vectors[point_idx], &vectors[i]);
count += 1;
}
}
if count > 0 {
let avg_dist = sum / count as f64;
if avg_dist < min_avg_dist {
min_avg_dist = avg_dist;
}
}
}
min_avg_dist
}
/// Compute Euclidean distance between two vectors
pub(crate) fn euclidean_distance(&self, v1: &[f32], v2: &[f32]) -> f64 {
if v1.len() != v2.len() {
return f64::MAX;
}
let sum: f32 = v1
.iter()
.zip(v2.iter())
.map(|(a, b)| (a - b) * (a - b))
.sum();
(sum as f64).sqrt()
}
}