1use scirs2_core::ndarray::{ArrayBase, Data, Dimension, Ix1, Ix2};
8use scirs2_core::numeric::{Float, NumCast};
9use std::collections::HashMap;
10use std::ops::{AddAssign, DivAssign};
11
12use crate::error::{MetricsError, Result};
13
14#[allow(dead_code)]
46pub fn local_density_factor<F, S1, S2, D>(
47 x: &ArrayBase<S1, Ix2>,
48 labels: &ArrayBase<S2, D>,
49 k: Option<usize>,
50) -> Result<HashMap<usize, F>>
51where
52 F: Float
53 + NumCast
54 + std::fmt::Debug
55 + scirs2_core::ndarray::ScalarOperand
56 + AddAssign
57 + DivAssign,
58 S1: Data<Elem = F>,
59 S2: Data<Elem = usize>,
60 D: Dimension,
61{
62 let n_samples = x.shape()[0];
64 if n_samples != labels.len() {
65 return Err(MetricsError::InvalidInput(format!(
66 "x has {} samples, but labels has {} samples",
67 n_samples,
68 labels.len()
69 )));
70 }
71
72 let k = k.unwrap_or_else(|| {
74 if n_samples <= 1 {
75 1
76 } else if n_samples < 10 {
77 std::cmp::min(2, n_samples - 1)
78 } else {
79 std::cmp::min(5, n_samples / 10)
80 }
81 });
82
83 let mut unique_labels = Vec::new();
85 for &label in labels.iter() {
86 if !unique_labels.contains(&label) {
87 unique_labels.push(label);
88 }
89 }
90
91 unique_labels.sort();
93
94 let mut all_knn_distances = Vec::new();
96 let mut cluster_idx = HashMap::new();
97
98 for label in &unique_labels {
99 cluster_idx.insert(*label, Vec::new());
100 }
101
102 for (i, &label) in labels.iter().enumerate() {
104 if let Some(indices) = cluster_idx.get_mut(&label) {
105 indices.push(i);
106 }
107 }
108
109 for i in 0..n_samples {
111 let current_point = x.row(i);
112
113 let mut distances = Vec::new();
115 for j in 0..n_samples {
116 if i != j {
117 let other_point = x.row(j);
118 let dist = calculate_euclidean_distance(¤t_point, &other_point);
119 distances.push(dist);
120 }
121 }
122
123 distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
125 let k_distance = if distances.len() >= k {
126 distances[k - 1] } else if !distances.is_empty() {
128 distances[distances.len() - 1] } else {
130 F::zero() };
132
133 all_knn_distances.push((i, k_distance));
134 }
135
136 let mut ldf = HashMap::new();
138
139 for &label in &unique_labels {
140 let cluster_indices = cluster_idx.get(&label).unwrap();
141 if cluster_indices.is_empty() {
142 continue;
143 }
144
145 let mut cluster_knn_sum = F::zero();
147 let mut count = 0;
148
149 for &idx in cluster_indices {
150 cluster_knn_sum += all_knn_distances[idx].1;
151 count += 1;
152 }
153
154 let avg_knn = if count > 0 {
155 cluster_knn_sum / F::from(count).unwrap()
156 } else {
157 F::zero()
158 };
159
160 let factor = if avg_knn > F::zero() {
163 F::one() / avg_knn
164 } else {
165 F::zero()
166 };
167
168 ldf.insert(label, factor);
169 }
170
171 Ok(ldf)
172}
173
174#[allow(dead_code)]
207pub fn relative_density_index<F, S1, S2, D>(
208 x: &ArrayBase<S1, Ix2>,
209 labels: &ArrayBase<S2, D>,
210 k: Option<usize>,
211) -> Result<F>
212where
213 F: Float
214 + NumCast
215 + std::fmt::Debug
216 + scirs2_core::ndarray::ScalarOperand
217 + AddAssign
218 + DivAssign,
219 S1: Data<Elem = F>,
220 S2: Data<Elem = usize>,
221 D: Dimension,
222{
223 let n_samples = x.shape()[0];
225 if n_samples != labels.len() {
226 return Err(MetricsError::InvalidInput(format!(
227 "x has {} samples, but labels has {} samples",
228 n_samples,
229 labels.len()
230 )));
231 }
232
233 let mut unique_labels = Vec::new();
235 for &label in labels.iter() {
236 if !unique_labels.contains(&label) {
237 unique_labels.push(label);
238 }
239 }
240
241 unique_labels.sort();
243
244 let k = k.unwrap_or_else(|| {
246 if n_samples <= 1 {
247 1
248 } else if n_samples < 10 {
249 std::cmp::min(2, n_samples - 1)
250 } else {
251 std::cmp::min(5, n_samples / 10)
252 }
253 });
254
255 let mut intra_density_sum = F::zero();
257 let mut inter_density_sum = F::zero();
258
259 for (i, &label_i) in labels.iter().enumerate() {
260 let mut intra_distances = Vec::new();
262 let mut inter_distances = Vec::new();
263
264 for (j, &label_j) in labels.iter().enumerate() {
265 if i != j {
266 let dist = calculate_euclidean_distance(&x.row(i), &x.row(j));
267
268 if label_i == label_j {
269 intra_distances.push(dist);
270 } else {
271 inter_distances.push(dist);
272 }
273 }
274 }
275
276 intra_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
278 let intra_k = std::cmp::min(k, intra_distances.len());
279
280 if intra_k > 0 {
281 let knn_intra_dist = intra_distances[intra_k - 1];
282 let intra_density = if knn_intra_dist > F::zero() {
283 F::one() / knn_intra_dist
284 } else {
285 F::zero()
286 };
287 intra_density_sum += intra_density;
288 }
289
290 inter_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
292 let inter_k = std::cmp::min(k, inter_distances.len());
293
294 if inter_k > 0 {
295 let knn_inter_dist = inter_distances[inter_k - 1];
296 let inter_density = if knn_inter_dist > F::zero() {
297 F::one() / knn_inter_dist
298 } else {
299 F::zero()
300 };
301 inter_density_sum += inter_density;
302 }
303 }
304
305 let avg_intra_density = if n_samples > 0 {
307 intra_density_sum / F::from(n_samples).unwrap()
308 } else {
309 F::zero()
310 };
311
312 let avg_inter_density = if n_samples > 0 {
313 inter_density_sum / F::from(n_samples).unwrap()
314 } else {
315 F::zero()
316 };
317
318 let rdi = if avg_inter_density > F::zero() {
320 avg_intra_density / avg_inter_density
321 } else if avg_intra_density > F::zero() {
322 F::max_value() } else {
324 F::one() };
326
327 Ok(rdi)
328}
329
330#[allow(dead_code)]
363pub fn density_based_cluster_validity<F, S1, S2, D>(
364 x: &ArrayBase<S1, Ix2>,
365 labels: &ArrayBase<S2, D>,
366 k: Option<usize>,
367) -> Result<F>
368where
369 F: Float
370 + NumCast
371 + std::fmt::Debug
372 + scirs2_core::ndarray::ScalarOperand
373 + AddAssign
374 + DivAssign,
375 S1: Data<Elem = F>,
376 S2: Data<Elem = usize>,
377 D: Dimension,
378{
379 let n_samples = x.shape()[0];
381 if n_samples != labels.len() {
382 return Err(MetricsError::InvalidInput(format!(
383 "x has {} samples, but labels has {} samples",
384 n_samples,
385 labels.len()
386 )));
387 }
388
389 let k = k.unwrap_or_else(|| {
391 if n_samples <= 1 {
392 1
393 } else if n_samples < 10 {
394 std::cmp::min(2, n_samples - 1)
395 } else {
396 std::cmp::min(5, n_samples / 10)
397 }
398 });
399
400 let mut unique_labels = Vec::new();
402 for &label in labels.iter() {
403 if !unique_labels.contains(&label) {
404 unique_labels.push(label);
405 }
406 }
407
408 if unique_labels.len() < 2 {
409 return Err(MetricsError::InvalidInput(
410 "At least two clusters are required to calculate DBCV".to_string(),
411 ));
412 }
413
414 unique_labels.sort();
416
417 let mut cluster_indices = HashMap::new();
419 for label in &unique_labels {
420 cluster_indices.insert(*label, Vec::new());
421 }
422
423 for (i, &label) in labels.iter().enumerate() {
424 if let Some(indices) = cluster_indices.get_mut(&label) {
425 indices.push(i);
426 }
427 }
428
429 let mut sparseness_values = Vec::new();
431
432 for &label in &unique_labels {
433 let indices = cluster_indices.get(&label).unwrap();
434 if indices.len() <= 1 {
435 sparseness_values.push(F::zero());
437 continue;
438 }
439
440 let mut core_distances = Vec::new();
442
443 for &i in indices {
444 let mut distances = Vec::new();
445 for &j in indices {
446 if i != j {
447 let dist = calculate_euclidean_distance(&x.row(i), &x.row(j));
448 distances.push(dist);
449 }
450 }
451
452 distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
453 let k_actual = std::cmp::min(k, distances.len());
454
455 if k_actual > 0 {
456 core_distances.push(distances[k_actual - 1]);
457 } else {
458 core_distances.push(F::zero());
459 }
460 }
461
462 let avg_core_distance = if !core_distances.is_empty() {
464 core_distances.iter().fold(F::zero(), |sum, &val| sum + val)
465 / F::from(core_distances.len()).unwrap()
466 } else {
467 F::zero()
468 };
469
470 let variance = if core_distances.len() > 1 {
472 let mean = avg_core_distance;
473 let sum_squared_diff = core_distances
474 .iter()
475 .fold(F::zero(), |sum, &val| sum + (val - mean) * (val - mean));
476 sum_squared_diff / F::from(core_distances.len() - 1).unwrap()
477 } else {
478 F::zero()
479 };
480
481 let sparseness = avg_core_distance * (F::one() + variance.sqrt());
483 sparseness_values.push(sparseness);
484 }
485
486 let mut separation_matrix = vec![vec![F::zero(); unique_labels.len()]; unique_labels.len()];
488
489 for (i, &label_i) in unique_labels.iter().enumerate() {
490 let indices_i = cluster_indices.get(&label_i).unwrap();
491
492 for (j, &label_j) in unique_labels.iter().enumerate() {
493 if i == j {
494 continue;
495 }
496
497 let indices_j = cluster_indices.get(&label_j).unwrap();
498
499 let mut min_dist = F::max_value();
501
502 for &idx_i in indices_i {
503 for &idx_j in indices_j {
504 let dist = calculate_euclidean_distance(&x.row(idx_i), &x.row(idx_j));
505 min_dist = F::min(min_dist, dist);
506 }
507 }
508
509 separation_matrix[i][j] = min_dist;
510 }
511 }
512
513 let mut cluster_validity = Vec::new();
515
516 for (i, &_) in unique_labels.iter().enumerate() {
517 let cluster_sparseness = sparseness_values[i];
518
519 let mut min_separation = F::max_value();
521 for j in 0..unique_labels.len() {
522 if i != j && separation_matrix[i][j] < min_separation {
523 min_separation = separation_matrix[i][j];
524 }
525 }
526
527 if min_separation == F::max_value() {
529 min_separation = F::zero();
530 }
531
532 let validity = if min_separation > F::zero() || cluster_sparseness > F::zero() {
534 (min_separation - cluster_sparseness) / F::max(min_separation, cluster_sparseness)
535 } else {
536 F::zero()
537 };
538
539 cluster_validity.push(validity);
540 }
541
542 let mut weighted_sum = F::zero();
544 let mut weight_sum = F::zero();
545
546 for (i, &label) in unique_labels.iter().enumerate() {
547 let weight = F::from(cluster_indices.get(&label).unwrap().len()).unwrap();
548 weighted_sum += weight * cluster_validity[i];
549 weight_sum += weight;
550 }
551
552 let dbcv = if weight_sum > F::zero() {
553 weighted_sum / weight_sum
554 } else {
555 F::zero()
556 };
557
558 Ok(dbcv)
560}
561
562#[allow(dead_code)]
564fn calculate_euclidean_distance<F, S1, S2>(a: &ArrayBase<S1, Ix1>, b: &ArrayBase<S2, Ix1>) -> F
565where
566 F: Float,
567 S1: Data<Elem = F>,
568 S2: Data<Elem = F>,
569{
570 let mut sum = F::zero();
571 for (x, y) in a.iter().zip(b.iter()) {
572 let diff = *x - *y;
573 sum = sum + diff * diff;
574 }
575 sum.sqrt()
576}
577
578#[cfg(test)]
579mod tests {
580 use super::*;
581 use scirs2_core::ndarray::Array2;
582
583 #[test]
584 fn test_local_density_factor() {
585 let well_separated = Array2::from_shape_vec(
587 (6, 2),
588 vec![
589 1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 10.0, 12.0, 10.2, 11.8, 10.5, 12.2,
590 ],
591 )
592 .unwrap();
593
594 let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
595
596 let factors = local_density_factor(&well_separated, &labels, Some(2)).unwrap();
598
599 assert!(factors.len() == 2);
601 assert!(factors.contains_key(&0));
602 assert!(factors.contains_key(&1));
603
604 let varying_density = Array2::from_shape_vec(
606 (6, 2),
607 vec![
608 1.0, 1.1, 1.05, 1.05, 1.1, 1.0, 5.0, 5.0, 6.0, 6.0, 7.0, 7.0, ],
611 )
612 .unwrap();
613
614 let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
615
616 let factors = local_density_factor(&varying_density, &labels, Some(2)).unwrap();
618
619 assert!(*factors.get(&0).unwrap() > *factors.get(&1).unwrap());
621 }
622
623 #[test]
624 fn test_relative_density_index() {
625 let well_separated = Array2::from_shape_vec(
627 (6, 2),
628 vec![
629 1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 10.0, 12.0, 10.2, 11.8, 10.5, 12.2,
630 ],
631 )
632 .unwrap();
633
634 let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
635
636 let rdi = relative_density_index(&well_separated, &labels, Some(2)).unwrap();
638
639 assert!(rdi > 1.0);
641
642 let overlapping = Array2::from_shape_vec(
644 (6, 2),
645 vec![1.0, 2.0, 1.5, 1.8, 3.0, 3.0, 3.0, 3.0, 4.0, 4.5, 5.0, 5.5],
646 )
647 .unwrap();
648
649 let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
650
651 let rdi_overlapping = relative_density_index(&overlapping, &labels, Some(2)).unwrap();
653
654 assert!(rdi > rdi_overlapping);
656 }
657
658 #[test]
659 fn test_density_based_cluster_validity() {
660 let well_separated = Array2::from_shape_vec(
662 (6, 2),
663 vec![
664 1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 10.0, 12.0, 10.2, 11.8, 10.5, 12.2,
665 ],
666 )
667 .unwrap();
668
669 let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
670
671 let dbcv = density_based_cluster_validity(&well_separated, &labels, Some(2)).unwrap();
673
674 assert!(dbcv > 0.0);
676
677 let poor_clustering = Array2::from_shape_vec(
679 (6, 2),
680 vec![1.0, 2.0, 8.0, 9.0, 1.2, 2.2, 8.0, 9.0, 1.0, 2.0, 8.0, 9.0],
681 )
682 .unwrap();
683
684 let bad_labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
686
687 let bad_dbcv =
689 density_based_cluster_validity(&poor_clustering, &bad_labels, Some(2)).unwrap();
690
691 assert!(dbcv > bad_dbcv);
693 }
694}