use crate::correlation::{CorrelationAnalyzer, DTWConfig};
use crate::error::TimeSeriesError;
use scirs2_core::ndarray::{s, Array1, Array2};
use std::collections::{HashMap, HashSet};

pub type ClusteringResult<T> = Result<T, TimeSeriesError>;

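/// Outcome of a clustering run: per-series labels plus the centroids,
/// pairwise distances, and fit-quality metrics of the final partition.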
#[derive(Debug, Clone)]
pub struct TimeSeriesClusteringResult {
    /// Cluster label assigned to each input series.
    pub cluster_labels: Array1<usize>,
    /// Number of clusters found.
    pub n_clusters: usize,
    /// Centroid (mean series) of each cluster.
    pub centroids: Vec<Array1<f64>>,
    /// Sum of squared distances from each series to its cluster centroid.
    pub inertia: f64,
    /// Mean silhouette score over all series, in [-1, 1].
    pub silhouette_score: f64,
    /// Pairwise distance matrix between the input series.
    pub distance_matrix: Array2<f64>,
    /// Algorithm that produced this result.
    pub algorithm: ClusteringAlgorithm,
}

/// Outcome of a classification run over a set of test series.
#[derive(Debug, Clone)]
pub struct TimeSeriesClassificationResult {
    /// Predicted class label for each test series.
    pub predicted_labels: Array1<usize>,
    /// Per-class membership probabilities, if available.
    pub probabilities: Option<Array2<f64>>,
    /// Confidence of each prediction.
    pub confidence_scores: Array1<f64>,
    /// Distances to the nearest training neighbors, if available.
    pub neighbor_distances: Option<Array2<f64>>,
    /// Algorithm that produced this result.
    pub algorithm: ClassificationAlgorithm,
    /// Number of distinct classes.
    pub n_classes: usize,
}

/// Outcome of shapelet discovery over a labeled training set.
#[derive(Debug, Clone)]
pub struct ShapeletResult {
    /// Discovered shapelets, ordered by decreasing information gain.
    pub shapelets: Vec<Shapelet>,
    /// Information gain of each selected shapelet.
    pub information_gains: Array1<f64>,
    /// Algorithm used for discovery.
    pub algorithm: ShapeletAlgorithm,
    /// Number of series in the training data.
    pub n_series: usize,
}

/// A discriminative subsequence extracted from one training series.
#[derive(Debug, Clone)]
pub struct Shapelet {
    /// The shapelet subsequence itself.
    pub data: Array1<f64>,
    /// Index of the series the shapelet was extracted from.
    pub series_index: usize,
    /// Start position of the shapelet within that series.
    pub start_position: usize,
    /// Length of the shapelet.
    pub length: usize,
    /// Information gain achieved by splitting on this shapelet.
    pub information_gain: f64,
    /// Overall quality score of the shapelet.
    pub quality: f64,
}

/// Supported clustering algorithms.
#[derive(Debug, Clone, Copy)]
pub enum ClusteringAlgorithm {
    /// K-means with a configurable distance metric.
    KMeans,
    /// Agglomerative hierarchical clustering.
    Hierarchical,
    /// Density-based clustering with noise detection.
    DBSCAN,
    /// Spectral clustering.
    Spectral,
    /// Gaussian mixture model.
    GMM,
}

/// Supported classification algorithms.
#[derive(Debug, Clone, Copy)]
pub enum ClassificationAlgorithm {
    /// K-nearest neighbors, typically with DTW distance.
    KnnDTW,
    /// Shapelet-based classification.
    Shapelet,
    /// Classification on extracted features.
    FeatureBased,
    /// Ensemble of classifiers.
    Ensemble,
}

/// Distance metrics between two time series of equal length.
#[derive(Debug, Clone, Copy)]
pub enum TimeSeriesDistance {
    /// Euclidean (L2) distance.
    Euclidean,
    /// Dynamic time warping distance.
    DTW,
    /// Correlation distance: 1 - |Pearson correlation|.
    Correlation,
    /// Cosine distance: 1 - |cosine similarity|.
    Cosine,
    /// Manhattan (L1) distance.
    Manhattan,
    /// Minkowski distance with the given exponent p.
    Minkowski(f64),
}

/// Strategies for shapelet discovery.
#[derive(Debug, Clone, Copy)]
pub enum ShapeletAlgorithm {
    /// Exhaustive search over all candidate subsequences.
    BruteForce,
    /// Faster heuristic search.
    Fast,
    /// Optimization-based shapelet learning.
    Learning,
}

/// Configuration for k-means clustering.
#[derive(Debug, Clone)]
pub struct KMeansConfig {
    /// Number of clusters.
    pub n_clusters: usize,
    /// Maximum number of iterations per initialization.
    pub max_iterations: usize,
    /// Convergence tolerance on the change in inertia.
    pub tolerance: f64,
    /// Number of random initializations; the best run is kept.
    pub n_init: usize,
    /// Distance metric used for assignment and inertia.
    pub distance: TimeSeriesDistance,
    /// Optional seed for reproducible initialization.
    pub random_seed: Option<u64>,
}

impl Default for KMeansConfig {
    fn default() -> Self {
        Self {
            n_clusters: 3,
            max_iterations: 300,
            tolerance: 1e-4,
            n_init: 10,
            distance: TimeSeriesDistance::Euclidean,
            random_seed: None,
        }
    }
}

/// Configuration for hierarchical clustering.
#[derive(Debug, Clone)]
pub struct HierarchicalConfig {
    /// Target number of clusters; merging stops once it is reached.
    pub n_clusters: Option<usize>,
    /// Linkage criterion used to merge clusters.
    pub linkage: LinkageMethod,
    /// Distance metric between series.
    pub distance: TimeSeriesDistance,
    /// Optional distance threshold; merging stops early if exceeded.
    pub distance_threshold: Option<f64>,
}

/// Linkage criteria for agglomerative clustering.
#[derive(Debug, Clone, Copy)]
pub enum LinkageMethod {
    /// Minimum pairwise distance between clusters.
    Single,
    /// Maximum pairwise distance between clusters.
    Complete,
    /// Mean pairwise distance between clusters.
    Average,
    /// Ward's minimum-variance criterion.
    Ward,
}

impl Default for HierarchicalConfig {
    fn default() -> Self {
        Self {
            n_clusters: Some(3),
            linkage: LinkageMethod::Average,
            distance: TimeSeriesDistance::Euclidean,
            distance_threshold: None,
        }
    }
}

/// Configuration for DBSCAN clustering.
#[derive(Debug, Clone)]
pub struct DBSCANConfig {
    /// Neighborhood radius.
    pub eps: f64,
    /// Minimum number of neighbors for a core point.
    pub min_samples: usize,
    /// Distance metric between series.
    pub distance: TimeSeriesDistance,
}

impl Default for DBSCANConfig {
    fn default() -> Self {
        Self {
            eps: 0.5,
            min_samples: 5,
            distance: TimeSeriesDistance::Euclidean,
        }
    }
}

/// Configuration for k-nearest-neighbor classification.
#[derive(Debug, Clone)]
pub struct KNNConfig {
    /// Number of nearest neighbors.
    pub k: usize,
    /// Distance metric between series.
    pub distance: TimeSeriesDistance,
    /// How neighbor votes are weighted.
    pub weights: WeightingScheme,
    /// Optional DTW configuration for DTW-based distances.
    pub dtw_config: Option<DTWConfig>,
}

/// How neighbor votes are weighted in KNN classification.
#[derive(Debug, Clone, Copy)]
pub enum WeightingScheme {
    /// All neighbors vote equally.
    Uniform,
    /// Votes are weighted by inverse distance.
    Distance,
    /// Votes are weighted by exp(-distance).
    Exponential,
}

impl Default for KNNConfig {
    fn default() -> Self {
        Self {
            k: 5,
            distance: TimeSeriesDistance::DTW,
            weights: WeightingScheme::Distance,
            dtw_config: Some(DTWConfig::default()),
        }
    }
}

/// Configuration for shapelet discovery.
#[derive(Debug, Clone)]
pub struct ShapeletConfig {
    /// Minimum shapelet length to consider.
    pub min_length: usize,
    /// Maximum shapelet length to consider.
    pub max_length: usize,
    /// Maximum number of shapelets to return.
    pub n_shapelets: usize,
    /// Discovery algorithm to use.
    pub algorithm: ShapeletAlgorithm,
    /// Minimum information gain for a candidate to be kept.
    pub min_info_gain: f64,
    /// Optional seed for reproducibility.
    pub random_seed: Option<u64>,
}

impl Default for ShapeletConfig {
    fn default() -> Self {
        Self {
            min_length: 10,
            max_length: 50,
            n_shapelets: 10,
            algorithm: ShapeletAlgorithm::Fast,
            min_info_gain: 0.01,
            random_seed: None,
        }
    }
}

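/// Entry point for time-series clustering, classification, and shapelet
/// discovery. Every algorithm treats each row of the input `Array2<f64>`
/// as one time series of equal length.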
pub struct TimeSeriesClusterer {
    /// Optional seed for reproducible centroid initialization.
    pub random_seed: Option<u64>,
    /// Analyzer used for DTW distance computations.
    correlation_analyzer: CorrelationAnalyzer,
}

impl TimeSeriesClusterer {
    /// Creates a clusterer with no fixed random seed.
    pub fn new() -> Self {
        Self {
            random_seed: None,
            correlation_analyzer: CorrelationAnalyzer::new(),
        }
    }

    /// Creates a clusterer seeded for reproducible results.
    pub fn with_seed(seed: u64) -> Self {
        Self {
            random_seed: Some(seed),
            correlation_analyzer: CorrelationAnalyzer::with_seed(seed),
        }
    }

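    /// K-means clustering over the rows of `data` (one series per row).
    ///
    /// Runs `config.n_init` random initializations and keeps the run with
    /// the lowest inertia (sum of squared distances to the assigned
    /// centroids). A minimal usage sketch with placeholder data (the
    /// all-zeros array is illustrative only; see the unit tests below for
    /// a discriminative setup):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::Array2;
    ///
    /// let data = Array2::<f64>::zeros((6, 10)); // 6 series of length 10
    /// let clusterer = TimeSeriesClusterer::new();
    /// let config = KMeansConfig { n_clusters: 2, ..Default::default() };
    /// let result = clusterer.kmeans_clustering(&data, &config)?;
    /// assert_eq!(result.cluster_labels.len(), 6);
    /// ```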
    pub fn kmeans_clustering(
        &self,
        data: &Array2<f64>,
        config: &KMeansConfig,
    ) -> ClusteringResult<TimeSeriesClusteringResult> {
        let (n_series, _series_length) = data.dim();

        if n_series < config.n_clusters {
            return Err(TimeSeriesError::InvalidInput(
                "Number of time series must be at least equal to number of clusters".to_string(),
            ));
        }

        let distance_matrix = self.compute_distance_matrix(data, config.distance)?;

        let mut best_result = None;
        let mut best_inertia = f64::INFINITY;

        for _init in 0..config.n_init {
            let mut centroids = self.initialize_centroids_randomly(data, config.n_clusters)?;
            let mut cluster_labels = Array1::zeros(n_series);
            let mut prev_inertia = f64::INFINITY;

            for _iter in 0..config.max_iterations {
                // Assignment step: attach each series to its nearest centroid.
                for i in 0..n_series {
                    let mut min_distance = f64::INFINITY;
                    let mut best_cluster = 0;

                    #[allow(clippy::needless_range_loop)]
                    for k in 0..config.n_clusters {
                        let distance = self.compute_series_distance(
                            &data.row(i).to_owned(),
                            &centroids[k],
                            config.distance,
                        )?;

                        if distance < min_distance {
                            min_distance = distance;
                            best_cluster = k;
                        }
                    }

                    cluster_labels[i] = best_cluster;
                }

                // Update step: recompute each centroid as the mean of its members.
                #[allow(clippy::needless_range_loop)]
                for k in 0..config.n_clusters {
                    let cluster_points: Vec<usize> = cluster_labels
                        .iter()
                        .enumerate()
                        .filter(|(_, &label)| label == k)
                        .map(|(i, _)| i)
                        .collect();

                    if !cluster_points.is_empty() {
                        centroids[k] = self.compute_centroid(data, &cluster_points)?;
                    }
                }

                let inertia =
                    self.compute_inertia(data, &centroids, &cluster_labels, config.distance)?;

                // Stop once the change in inertia falls below the tolerance.
                if (prev_inertia - inertia).abs() < config.tolerance {
                    break;
                }
                prev_inertia = inertia;
            }

            let final_inertia =
                self.compute_inertia(data, &centroids, &cluster_labels, config.distance)?;

            if final_inertia < best_inertia {
                best_inertia = final_inertia;

                let silhouette_score =
                    self.compute_silhouette_score(&distance_matrix, &cluster_labels)?;

                best_result = Some(TimeSeriesClusteringResult {
                    cluster_labels,
                    n_clusters: config.n_clusters,
                    centroids,
                    inertia: final_inertia,
                    silhouette_score,
                    distance_matrix: distance_matrix.clone(),
                    algorithm: ClusteringAlgorithm::KMeans,
                });
            }
        }

        best_result.ok_or_else(|| {
            TimeSeriesError::ComputationError("K-means clustering failed".to_string())
        })
    }

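    /// Agglomerative hierarchical clustering under the configured linkage.
    ///
    /// Starts from singleton clusters and greedily merges the closest pair
    /// until `config.n_clusters` remain or `config.distance_threshold` is
    /// exceeded.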
    pub fn hierarchical_clustering(
        &self,
        data: &Array2<f64>,
        config: &HierarchicalConfig,
    ) -> ClusteringResult<TimeSeriesClusteringResult> {
        let _n_series = data.nrows();

        let distance_matrix = self.compute_distance_matrix(data, config.distance)?;

        let cluster_labels = self.hierarchical_clustering_impl(&distance_matrix, config)?;

        let n_clusters = cluster_labels.iter().max().unwrap_or(&0) + 1;

        let mut centroids = Vec::new();
        for k in 0..n_clusters {
            let cluster_points: Vec<usize> = cluster_labels
                .iter()
                .enumerate()
                .filter(|(_, &label)| label == k)
                .map(|(i, _)| i)
                .collect();

            if !cluster_points.is_empty() {
                centroids.push(self.compute_centroid(data, &cluster_points)?);
            } else {
                centroids.push(Array1::zeros(data.ncols()));
            }
        }

        let inertia = self.compute_inertia(data, &centroids, &cluster_labels, config.distance)?;
        let silhouette_score = self.compute_silhouette_score(&distance_matrix, &cluster_labels)?;

        Ok(TimeSeriesClusteringResult {
            cluster_labels,
            n_clusters,
            centroids,
            inertia,
            silhouette_score,
            distance_matrix,
            algorithm: ClusteringAlgorithm::Hierarchical,
        })
    }

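    /// DBSCAN density-based clustering.
    ///
    /// A series with at least `config.min_samples` neighbors within
    /// `config.eps` becomes a core point; points density-reachable from a
    /// core point join its cluster, and all remaining points are labeled
    /// noise (`usize::MAX` in the returned `cluster_labels`).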
    pub fn dbscan_clustering(
        &self,
        data: &Array2<f64>,
        config: &DBSCANConfig,
    ) -> ClusteringResult<TimeSeriesClusteringResult> {
        let n_series = data.nrows();

        let distance_matrix = self.compute_distance_matrix(data, config.distance)?;

        // usize::MAX marks unvisited points; usize::MAX - 1 marks noise.
        let mut cluster_labels = Array1::from_elem(n_series, usize::MAX);
        let mut cluster_id = 0;

        for i in 0..n_series {
            if cluster_labels[i] != usize::MAX {
                continue;
            }

            let neighbors = self.get_neighbors(&distance_matrix, i, config.eps);

            if neighbors.len() < config.min_samples {
                cluster_labels[i] = usize::MAX - 1;
            } else {
                self.expand_cluster(
                    &distance_matrix,
                    &mut cluster_labels,
                    i,
                    &neighbors,
                    cluster_id,
                    config,
                )?;
                cluster_id += 1;
            }
        }

        // Renumber clusters densely; noise points keep the usize::MAX marker.
        let mut final_labels = Array1::zeros(n_series);
        let mut cluster_map = HashMap::new();
        let mut next_cluster = 0;

        for i in 0..n_series {
            if cluster_labels[i] == usize::MAX - 1 {
                final_labels[i] = usize::MAX;
            } else if cluster_labels[i] != usize::MAX {
                let mapped_cluster = *cluster_map.entry(cluster_labels[i]).or_insert_with(|| {
                    let cluster = next_cluster;
                    next_cluster += 1;
                    cluster
                });
                final_labels[i] = mapped_cluster;
            }
        }

        let n_clusters = next_cluster;

        let mut centroids = Vec::new();
        for k in 0..n_clusters {
            let cluster_points: Vec<usize> = final_labels
                .iter()
                .enumerate()
                .filter(|(_, &label)| label == k)
                .map(|(i, _)| i)
                .collect();

            if !cluster_points.is_empty() {
                centroids.push(self.compute_centroid(data, &cluster_points)?);
            } else {
                centroids.push(Array1::zeros(data.ncols()));
            }
        }

        let inertia = if n_clusters > 0 {
            self.compute_inertia_with_noise(data, &centroids, &final_labels, config.distance)?
        } else {
            0.0
        };

        let silhouette_score = if n_clusters > 1 {
            self.compute_silhouette_score_with_noise(&distance_matrix, &final_labels)?
        } else {
            0.0
        };

        Ok(TimeSeriesClusteringResult {
            cluster_labels: final_labels,
            n_clusters,
            centroids,
            inertia,
            silhouette_score,
            distance_matrix,
            algorithm: ClusteringAlgorithm::DBSCAN,
        })
    }

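    /// K-nearest-neighbor classification of time series.
    ///
    /// Each test series receives the label with the largest (optionally
    /// distance-weighted) vote among its `config.k` nearest training
    /// series. A minimal sketch with placeholder data (zeros are
    /// illustrative; see the unit tests for a discriminative setup):
    ///
    /// ```ignore
    /// let train_data = Array2::<f64>::zeros((4, 10));
    /// let train_labels = Array1::from_vec(vec![0, 0, 1, 1]);
    /// let test_data = Array2::<f64>::zeros((2, 10));
    /// let clusterer = TimeSeriesClusterer::new();
    /// let config = KNNConfig { k: 3, ..Default::default() };
    /// let result =
    ///     clusterer.knn_classification(&train_data, &train_labels, &test_data, &config)?;
    /// assert_eq!(result.predicted_labels.len(), 2);
    /// ```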
    pub fn knn_classification(
        &self,
        train_data: &Array2<f64>,
        train_labels: &Array1<usize>,
        test_data: &Array2<f64>,
        config: &KNNConfig,
    ) -> ClusteringResult<TimeSeriesClassificationResult> {
        let n_train = train_data.nrows();
        let n_test = test_data.nrows();
        let n_classes = train_labels.iter().max().unwrap_or(&0) + 1;

        if train_labels.len() != n_train {
            return Err(TimeSeriesError::InvalidInput(
                "Training data and labels must have the same number of samples".to_string(),
            ));
        }

        let mut predicted_labels = Array1::zeros(n_test);
        let mut confidence_scores = Array1::zeros(n_test);
        let mut probabilities = Array2::zeros((n_test, n_classes));
        let mut neighbor_distances = Array2::zeros((n_test, config.k.min(n_train)));

        for i in 0..n_test {
            let test_series = test_data.row(i).to_owned();

            // Distance from this test series to every training series.
            let mut distances = Vec::new();
            for j in 0..n_train {
                let train_series = train_data.row(j).to_owned();
                let distance =
                    self.compute_series_distance(&test_series, &train_series, config.distance)?;
                distances.push((distance, train_labels[j]));
            }

            distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
            let k_neighbors = distances.iter().take(config.k).cloned().collect::<Vec<_>>();

            for (idx, (dist, _)) in k_neighbors.iter().enumerate() {
                if idx < neighbor_distances.ncols() {
                    neighbor_distances[[i, idx]] = *dist;
                }
            }

            // Weighted vote among the k nearest neighbors.
            let mut class_votes = vec![0.0; n_classes];
            let mut total_weight = 0.0;

            for (distance, class_label) in &k_neighbors {
                let weight = match config.weights {
                    WeightingScheme::Uniform => 1.0,
                    WeightingScheme::Distance => {
                        if *distance < f64::EPSILON {
                            1e6
                        } else {
                            1.0 / distance
                        }
                    }
                    WeightingScheme::Exponential => (-distance).exp(),
                };

                class_votes[*class_label] += weight;
                total_weight += weight;
            }

            if total_weight > 0.0 {
                for class in 0..n_classes {
                    probabilities[[i, class]] = class_votes[class] / total_weight;
                }
            }

            let predicted_class = class_votes
                .iter()
                .enumerate()
                .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
                .map(|(idx, _)| idx)
                .unwrap_or(0);

            predicted_labels[i] = predicted_class;
            // Guard against an empty neighbor set to avoid dividing by zero.
            confidence_scores[i] = if total_weight > 0.0 {
                class_votes[predicted_class] / total_weight
            } else {
                0.0
            };
        }

        Ok(TimeSeriesClassificationResult {
            predicted_labels,
            probabilities: Some(probabilities),
            confidence_scores,
            neighbor_distances: Some(neighbor_distances),
            algorithm: ClassificationAlgorithm::KnnDTW,
            n_classes,
        })
    }

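    /// Discovers discriminative shapelets by exhaustive candidate search.
    ///
    /// Every subsequence with length in `[config.min_length,
    /// config.max_length]` is scored by the best information gain of a
    /// distance-threshold split over the training labels; the top
    /// `config.n_shapelets` candidates are returned.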
    pub fn discover_shapelets(
        &self,
        data: &Array2<f64>,
        labels: &Array1<usize>,
        config: &ShapeletConfig,
    ) -> ClusteringResult<ShapeletResult> {
        let n_series = data.nrows();
        let series_length = data.ncols();

        if labels.len() != n_series {
            return Err(TimeSeriesError::InvalidInput(
                "Data and labels must have the same number of samples".to_string(),
            ));
        }

        // Enumerate every candidate subsequence in the allowed length range.
        let mut candidate_shapelets = Vec::new();

        for series_idx in 0..n_series {
            for length in config.min_length..=config.max_length.min(series_length) {
                for start_pos in 0..=(series_length - length) {
                    let shapelet_data = data
                        .slice(s![series_idx, start_pos..start_pos + length])
                        .to_owned();

                    let shapelet = Shapelet {
                        data: shapelet_data,
                        series_index: series_idx,
                        start_position: start_pos,
                        length,
                        information_gain: 0.0,
                        quality: 0.0,
                    };

                    candidate_shapelets.push(shapelet);
                }
            }
        }

        // Score candidates and keep those above the information-gain floor.
        let mut shapelet_scores = Vec::new();

        for (idx, shapelet) in candidate_shapelets.iter().enumerate() {
            let info_gain = self.compute_shapelet_information_gain(data, labels, shapelet)?;

            if info_gain >= config.min_info_gain {
                shapelet_scores.push((idx, info_gain));
            }
        }

        shapelet_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());

        let n_selected = config.n_shapelets.min(shapelet_scores.len());
        let mut selected_shapelets = Vec::new();
        let mut information_gains = Array1::zeros(n_selected);

        for i in 0..n_selected {
            let (shapelet_idx, info_gain) = shapelet_scores[i];
            let mut shapelet = candidate_shapelets[shapelet_idx].clone();
            shapelet.information_gain = info_gain;
            shapelet.quality = info_gain;
            selected_shapelets.push(shapelet);
            information_gains[i] = info_gain;
        }

        Ok(ShapeletResult {
            shapelets: selected_shapelets,
            information_gains,
            algorithm: config.algorithm,
            n_series,
        })
    }

    /// Builds the symmetric matrix of pairwise distances between all
    /// series; only the upper triangle is computed and then mirrored.
    fn compute_distance_matrix(
        &self,
        data: &Array2<f64>,
        distance: TimeSeriesDistance,
    ) -> ClusteringResult<Array2<f64>> {
        let n_series = data.nrows();
        let mut distance_matrix = Array2::zeros((n_series, n_series));

        for i in 0..n_series {
            for j in i..n_series {
                let dist = if i == j {
                    0.0
                } else {
                    self.compute_series_distance(
                        &data.row(i).to_owned(),
                        &data.row(j).to_owned(),
                        distance,
                    )?
                };

                distance_matrix[[i, j]] = dist;
                distance_matrix[[j, i]] = dist;
            }
        }

        Ok(distance_matrix)
    }

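    /// Distance between two equal-length series under the selected metric.
    /// Correlation distance is `1 - |Pearson r|` and cosine distance is
    /// `1 - |cosine similarity|`, so both lie in `[0, 1]`.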
    fn compute_series_distance(
        &self,
        x: &Array1<f64>,
        y: &Array1<f64>,
        distance: TimeSeriesDistance,
    ) -> ClusteringResult<f64> {
        match distance {
            TimeSeriesDistance::Euclidean => Ok((x - y).mapv(|d| d * d).sum().sqrt()),
            TimeSeriesDistance::DTW => {
                // Note: always uses the default DTW configuration.
                let dtw_config = DTWConfig::default();
                let result = self
                    .correlation_analyzer
                    .dynamic_time_warping(x, y, &dtw_config)
                    .map_err(|_| {
                        TimeSeriesError::ComputationError("DTW computation failed".to_string())
                    })?;
                Ok(result.distance)
            }
            TimeSeriesDistance::Correlation => {
                let correlation = self.pearson_correlation(x, y)?;
                Ok(1.0 - correlation.abs())
            }
            TimeSeriesDistance::Cosine => {
                let dot_product = x.dot(y);
                let norm_x = (x.dot(x)).sqrt();
                let norm_y = (y.dot(y)).sqrt();

                if norm_x < f64::EPSILON || norm_y < f64::EPSILON {
                    Ok(1.0)
                } else {
                    let cosine_sim = dot_product / (norm_x * norm_y);
                    Ok(1.0 - cosine_sim.abs())
                }
            }
            TimeSeriesDistance::Manhattan => Ok((x - y).mapv(|d| d.abs()).sum()),
            TimeSeriesDistance::Minkowski(p) => {
                Ok((x - y).mapv(|d| d.abs().powf(p)).sum().powf(1.0 / p))
            }
        }
    }

    /// Pearson correlation coefficient between two series; returns 0.0 for
    /// mismatched, empty, or constant inputs.
    fn pearson_correlation(&self, x: &Array1<f64>, y: &Array1<f64>) -> ClusteringResult<f64> {
        if x.len() != y.len() || x.is_empty() {
            return Ok(0.0);
        }

        let n = x.len() as f64;
        let mean_x = x.sum() / n;
        let mean_y = y.sum() / n;

        let mut numerator = 0.0;
        let mut sum_sq_x = 0.0;
        let mut sum_sq_y = 0.0;

        for (xi, yi) in x.iter().zip(y.iter()) {
            let diff_x = xi - mean_x;
            let diff_y = yi - mean_y;
            numerator += diff_x * diff_y;
            sum_sq_x += diff_x * diff_x;
            sum_sq_y += diff_y * diff_y;
        }

        let denominator = (sum_sq_x * sum_sq_y).sqrt();
        if denominator < f64::EPSILON {
            return Ok(0.0);
        }

        Ok(numerator / denominator)
    }

    /// Picks `k` distinct series, via a hash-based sequence that is
    /// deterministic for a given seed, to serve as the initial centroids.
    fn initialize_centroids_randomly(
        &self,
        data: &Array2<f64>,
        k: usize,
    ) -> ClusteringResult<Vec<Array1<f64>>> {
        let n_series = data.nrows();
        let mut centroids = Vec::new();

        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        if let Some(seed) = self.random_seed {
            seed.hash(&mut hasher);
        }

        let mut selected_indices = HashSet::new();

        for _i in 0..k {
            let mut idx = (hasher.finish() as usize) % n_series;
            while selected_indices.contains(&idx) {
                // Feed the collision back into the hasher to get a new index.
                hasher.write_usize(idx);
                idx = (hasher.finish() as usize) % n_series;
            }
            selected_indices.insert(idx);
            centroids.push(data.row(idx).to_owned());
        }

        Ok(centroids)
    }

    /// Element-wise mean of the series at the given row indices.
    fn compute_centroid(
        &self,
        data: &Array2<f64>,
        indices: &[usize],
    ) -> ClusteringResult<Array1<f64>> {
        if indices.is_empty() {
            return Ok(Array1::zeros(data.ncols()));
        }

        let mut centroid = Array1::zeros(data.ncols());

        for &idx in indices {
            centroid = centroid + data.row(idx);
        }

        centroid /= indices.len() as f64;
        Ok(centroid)
    }

    /// Sum of squared distances from each series to its assigned centroid.
    fn compute_inertia(
        &self,
        data: &Array2<f64>,
        centroids: &[Array1<f64>],
        labels: &Array1<usize>,
        distance: TimeSeriesDistance,
    ) -> ClusteringResult<f64> {
        let mut inertia = 0.0;

        for i in 0..data.nrows() {
            let cluster_id = labels[i];
            if cluster_id < centroids.len() {
                let dist = self.compute_series_distance(
                    &data.row(i).to_owned(),
                    &centroids[cluster_id],
                    distance,
                )?;
                inertia += dist * dist;
            }
        }

        Ok(inertia)
    }

    /// Like `compute_inertia`, but skips noise points labeled `usize::MAX`.
    fn compute_inertia_with_noise(
        &self,
        data: &Array2<f64>,
        centroids: &[Array1<f64>],
        labels: &Array1<usize>,
        distance: TimeSeriesDistance,
    ) -> ClusteringResult<f64> {
        let mut inertia = 0.0;

        for i in 0..data.nrows() {
            let cluster_id = labels[i];
            if cluster_id != usize::MAX && cluster_id < centroids.len() {
                let dist = self.compute_series_distance(
                    &data.row(i).to_owned(),
                    &centroids[cluster_id],
                    distance,
                )?;
                inertia += dist * dist;
            }
        }

        Ok(inertia)
    }

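    /// Mean silhouette score over all samples. For sample `i` with mean
    /// intra-cluster distance `a(i)` and smallest mean distance to another
    /// cluster `b(i)`, the per-sample score is
    /// `s(i) = (b(i) - a(i)) / max(a(i), b(i))`, which lies in `[-1, 1]`.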
    fn compute_silhouette_score(
        &self,
        distance_matrix: &Array2<f64>,
        labels: &Array1<usize>,
    ) -> ClusteringResult<f64> {
        let n_samples = distance_matrix.nrows();
        let mut silhouette_scores = Vec::new();

        for i in 0..n_samples {
            let cluster_i = labels[i];

            // a(i): mean distance to the other members of the same cluster.
            let same_cluster_distances: Vec<f64> = (0..n_samples)
                .filter(|&j| j != i && labels[j] == cluster_i)
                .map(|j| distance_matrix[[i, j]])
                .collect();

            let a_i = if same_cluster_distances.is_empty() {
                0.0
            } else {
                same_cluster_distances.iter().sum::<f64>() / same_cluster_distances.len() as f64
            };

            // b(i): smallest mean distance to any other cluster.
            let unique_clusters: HashSet<usize> = labels.iter().cloned().collect();
            let mut min_avg_distance = f64::INFINITY;

            for &other_cluster in &unique_clusters {
                if other_cluster != cluster_i {
                    let other_cluster_distances: Vec<f64> = (0..n_samples)
                        .filter(|&j| labels[j] == other_cluster)
                        .map(|j| distance_matrix[[i, j]])
                        .collect();

                    if !other_cluster_distances.is_empty() {
                        let avg_distance = other_cluster_distances.iter().sum::<f64>()
                            / other_cluster_distances.len() as f64;
                        min_avg_distance = min_avg_distance.min(avg_distance);
                    }
                }
            }

            let b_i = min_avg_distance;

            // With a single cluster b(i) stays infinite; score that case as 0
            // instead of propagating NaN.
            let silhouette = if b_i.is_finite() && a_i.max(b_i) > 0.0 {
                (b_i - a_i) / a_i.max(b_i)
            } else {
                0.0
            };

            silhouette_scores.push(silhouette);
        }

        Ok(silhouette_scores.iter().sum::<f64>() / silhouette_scores.len() as f64)
    }

    /// Silhouette score restricted to non-noise points (noise carries the
    /// `usize::MAX` label).
    fn compute_silhouette_score_with_noise(
        &self,
        distance_matrix: &Array2<f64>,
        labels: &Array1<usize>,
    ) -> ClusteringResult<f64> {
        let valid_indices: Vec<usize> = labels
            .iter()
            .enumerate()
            .filter(|(_, &label)| label != usize::MAX)
            .map(|(i, _)| i)
            .collect();

        if valid_indices.len() < 2 {
            return Ok(0.0);
        }

        let filtered_labels = Array1::from_iter(valid_indices.iter().map(|&i| labels[i]));
        let mut filtered_distance_matrix =
            Array2::zeros((valid_indices.len(), valid_indices.len()));

        for (i, &idx_i) in valid_indices.iter().enumerate() {
            for (j, &idx_j) in valid_indices.iter().enumerate() {
                filtered_distance_matrix[[i, j]] = distance_matrix[[idx_i, idx_j]];
            }
        }

        self.compute_silhouette_score(&filtered_distance_matrix, &filtered_labels)
    }

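    /// Greedy agglomerative merge loop over an explicit cluster list:
    /// repeatedly joins the pair with the smallest linkage distance until
    /// the target cluster count (or the distance threshold) is reached.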
    fn hierarchical_clustering_impl(
        &self,
        distance_matrix: &Array2<f64>,
        config: &HierarchicalConfig,
    ) -> ClusteringResult<Array1<usize>> {
        let n_samples = distance_matrix.nrows();
        let mut clusters: Vec<Vec<usize>> = (0..n_samples).map(|i| vec![i]).collect();

        let target_clusters = config.n_clusters.unwrap_or(1);

        while clusters.len() > target_clusters {
            // Find the closest pair of clusters under the configured linkage.
            let mut min_distance = f64::INFINITY;
            let mut merge_i = 0;
            let mut merge_j = 1;

            for i in 0..clusters.len() {
                for j in i + 1..clusters.len() {
                    let distance = self.compute_cluster_distance(
                        &clusters[i],
                        &clusters[j],
                        distance_matrix,
                        config.linkage,
                    );

                    if distance < min_distance {
                        min_distance = distance;
                        merge_i = i;
                        merge_j = j;
                    }
                }
            }

            if let Some(threshold) = config.distance_threshold {
                if min_distance > threshold {
                    break;
                }
            }

            let mut new_cluster = clusters[merge_i].clone();
            new_cluster.extend(clusters[merge_j].clone());

            // merge_i < merge_j by construction; remove the higher index
            // first so the lower one stays valid.
            clusters.remove(merge_j);
            clusters.remove(merge_i);

            clusters.push(new_cluster);
        }

        let mut labels = Array1::zeros(n_samples);
        for (cluster_id, cluster) in clusters.iter().enumerate() {
            for &sample_id in cluster {
                labels[sample_id] = cluster_id;
            }
        }

        Ok(labels)
    }

    #[allow(clippy::only_used_in_recursion)]
    fn compute_cluster_distance(
        &self,
        cluster1: &[usize],
        cluster2: &[usize],
        distance_matrix: &Array2<f64>,
        linkage: LinkageMethod,
    ) -> f64 {
        match linkage {
            LinkageMethod::Single => {
                // Minimum distance over all cross-cluster pairs.
                let mut min_dist = f64::INFINITY;
                for &i in cluster1 {
                    for &j in cluster2 {
                        min_dist = min_dist.min(distance_matrix[[i, j]]);
                    }
                }
                min_dist
            }
            LinkageMethod::Complete => {
                // Maximum distance over all cross-cluster pairs.
                let mut max_dist: f64 = 0.0;
                for &i in cluster1 {
                    for &j in cluster2 {
                        max_dist = max_dist.max(distance_matrix[[i, j]]);
                    }
                }
                max_dist
            }
            LinkageMethod::Average => {
                // Mean distance over all cross-cluster pairs.
                let mut sum_dist = 0.0;
                let mut count = 0;
                for &i in cluster1 {
                    for &j in cluster2 {
                        sum_dist += distance_matrix[[i, j]];
                        count += 1;
                    }
                }
                if count > 0 {
                    sum_dist / count as f64
                } else {
                    0.0
                }
            }
            LinkageMethod::Ward => {
                // Ward's criterion is approximated here by average linkage.
                self.compute_cluster_distance(
                    cluster1,
                    cluster2,
                    distance_matrix,
                    LinkageMethod::Average,
                )
            }
        }
    }

    /// Indices of all points within `eps` of `point`, excluding itself.
    fn get_neighbors(&self, distance_matrix: &Array2<f64>, point: usize, eps: f64) -> Vec<usize> {
        (0..distance_matrix.nrows())
            .filter(|&i| i != point && distance_matrix[[point, i]] <= eps)
            .collect()
    }

    /// Grows a DBSCAN cluster from a core `point` by absorbing all
    /// density-reachable points.
    fn expand_cluster(
        &self,
        distance_matrix: &Array2<f64>,
        labels: &mut Array1<usize>,
        point: usize,
        neighbors: &[usize],
        cluster_id: usize,
        config: &DBSCANConfig,
    ) -> ClusteringResult<()> {
        labels[point] = cluster_id;
        let mut seed_set = neighbors.to_vec();
        let mut i = 0;

        while i < seed_set.len() {
            let q = seed_set[i];

            // Reclaim points previously marked as noise.
            if labels[q] == usize::MAX - 1 {
                labels[q] = cluster_id;
            }

            // Unvisited point: claim it and, if it is itself a core point,
            // expand the search frontier with its neighbors.
            if labels[q] == usize::MAX {
                labels[q] = cluster_id;
                let q_neighbors = self.get_neighbors(distance_matrix, q, config.eps);

                if q_neighbors.len() >= config.min_samples {
                    for &neighbor in &q_neighbors {
                        if !seed_set.contains(&neighbor) {
                            seed_set.push(neighbor);
                        }
                    }
                }
            }

            i += 1;
        }

        Ok(())
    }

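    /// Information gain of the best distance-threshold split induced by a
    /// shapelet: `IG = H(S) - |S_L|/|S| * H(S_L) - |S_R|/|S| * H(S_R)`,
    /// where the sample set `S` is ordered by each series' minimum
    /// distance to the shapelet and split at every candidate position.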
    fn compute_shapelet_information_gain(
        &self,
        data: &Array2<f64>,
        labels: &Array1<usize>,
        shapelet: &Shapelet,
    ) -> ClusteringResult<f64> {
        let n_samples = data.nrows();
        let mut distances = Vec::new();

        // Distance from every series to the shapelet (best-matching offset).
        for i in 0..n_samples {
            let series = data.row(i).to_owned();
            let min_distance = self.compute_min_distance_to_shapelet(&series, &shapelet.data)?;
            distances.push((min_distance, labels[i]));
        }

        distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());

        let initial_entropy = self.compute_entropy(labels)?;
        let mut best_info_gain: f64 = 0.0;

        // Evaluate every split point along the sorted distances.
        for split_idx in 1..distances.len() {
            let left_labels: Vec<usize> = distances[..split_idx]
                .iter()
                .map(|(_, label)| *label)
                .collect();
            let right_labels: Vec<usize> = distances[split_idx..]
                .iter()
                .map(|(_, label)| *label)
                .collect();

            if left_labels.is_empty() || right_labels.is_empty() {
                continue;
            }

            let left_entropy = self.compute_entropy(&Array1::from_vec(left_labels.clone()))?;
            let right_entropy = self.compute_entropy(&Array1::from_vec(right_labels.clone()))?;

            let left_weight = left_labels.len() as f64 / n_samples as f64;
            let right_weight = right_labels.len() as f64 / n_samples as f64;

            let weighted_entropy = left_weight * left_entropy + right_weight * right_entropy;
            let info_gain = initial_entropy - weighted_entropy;

            best_info_gain = best_info_gain.max(info_gain);
        }

        Ok(best_info_gain)
    }

    /// Minimum Euclidean distance between the shapelet and any same-length
    /// subsequence of `series` (sliding-window best match).
    fn compute_min_distance_to_shapelet(
        &self,
        series: &Array1<f64>,
        shapelet: &Array1<f64>,
    ) -> ClusteringResult<f64> {
        let series_len = series.len();
        let shapelet_len = shapelet.len();

        if shapelet_len > series_len {
            return Ok(f64::INFINITY);
        }

        let mut min_distance = f64::INFINITY;

        for start_pos in 0..=(series_len - shapelet_len) {
            let subsequence = series
                .slice(s![start_pos..start_pos + shapelet_len])
                .to_owned();
            let distance = self.compute_series_distance(
                &subsequence,
                shapelet,
                TimeSeriesDistance::Euclidean,
            )?;
            min_distance = min_distance.min(distance);
        }

        Ok(min_distance)
    }

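    /// Shannon entropy of a label distribution:
    /// `H = -sum_c p_c * log2(p_c)`, where `p_c` is the fraction of
    /// samples in class `c`.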
    fn compute_entropy(&self, labels: &Array1<usize>) -> ClusteringResult<f64> {
        if labels.is_empty() {
            return Ok(0.0);
        }

        let mut class_counts = HashMap::new();
        for &label in labels {
            *class_counts.entry(label).or_insert(0) += 1;
        }

        let total = labels.len() as f64;
        let mut entropy = 0.0;

        for count in class_counts.values() {
            let p = *count as f64 / total;
            if p > 0.0 {
                entropy -= p * p.log2();
            }
        }

        Ok(entropy)
    }
}

impl Default for TimeSeriesClusterer {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::{Array1, Array2};

    #[test]
    fn test_kmeans_clustering() {
        let mut data = Array2::zeros((6, 10));

        // First three series follow a sine pattern.
        for i in 0..3 {
            for j in 0..10 {
                data[[i, j]] = (j as f64 * 0.1).sin();
            }
        }

        // Last three series follow a cosine pattern.
        for i in 3..6 {
            for j in 0..10 {
                data[[i, j]] = (j as f64 * 0.1).cos();
            }
        }

        let clusterer = TimeSeriesClusterer::new();
        let config = KMeansConfig {
            n_clusters: 2,
            ..Default::default()
        };

        let result = clusterer.kmeans_clustering(&data, &config).unwrap();

        assert_eq!(result.n_clusters, 2);
        assert_eq!(result.cluster_labels.len(), 6);
        assert_eq!(result.centroids.len(), 2);
        assert!(result.inertia >= 0.0);
        assert!(result.silhouette_score >= -1.0 && result.silhouette_score <= 1.0);
    }

    #[test]
    fn test_knn_classification() {
        let mut train_data = Array2::zeros((4, 10));
        let mut test_data = Array2::zeros((2, 10));

        // Class 0: sine-shaped training series.
        for i in 0..2 {
            for j in 0..10 {
                train_data[[i, j]] = (j as f64 * 0.1).sin();
            }
        }

        // Class 1: cosine-shaped training series.
        for i in 2..4 {
            for j in 0..10 {
                train_data[[i, j]] = (j as f64 * 0.1).cos();
            }
        }

        // One test series per class.
        for j in 0..10 {
            test_data[[0, j]] = (j as f64 * 0.1).sin();
            test_data[[1, j]] = (j as f64 * 0.1).cos();
        }

        let train_labels = Array1::from_vec(vec![0, 0, 1, 1]);

        let clusterer = TimeSeriesClusterer::new();
        let config = KNNConfig {
            k: 3,
            distance: TimeSeriesDistance::Euclidean,
            ..Default::default()
        };

        let result = clusterer
            .knn_classification(&train_data, &train_labels, &test_data, &config)
            .unwrap();

        assert_eq!(result.predicted_labels.len(), 2);
        assert_eq!(result.n_classes, 2);
        assert!(result.probabilities.is_some());
        assert_eq!(result.confidence_scores.len(), 2);

        // The sine-like test series should be assigned class 0 and the
        // cosine-like one class 1.
        assert_eq!(result.predicted_labels[0], 0);
        assert_eq!(result.predicted_labels[1], 1);
    }

    #[test]
    fn test_discover_shapelets() {
        let mut data = Array2::zeros((4, 20));
        let labels = Array1::from_vec(vec![0, 0, 1, 1]);

        // Class 0 carries a rectangular pulse at positions 8..=12.
        for i in 0..2 {
            for j in 0..20 {
                if (8..=12).contains(&j) {
                    data[[i, j]] = 1.0;
                } else {
                    data[[i, j]] = 0.0;
                }
            }
        }

        // Class 1 is flat zero throughout.
        for i in 2..4 {
            for j in 0..20 {
                data[[i, j]] = 0.0;
            }
        }

        let clusterer = TimeSeriesClusterer::new();
        let config = ShapeletConfig {
            min_length: 3,
            max_length: 8,
            n_shapelets: 5,
            ..Default::default()
        };

        let result = clusterer
            .discover_shapelets(&data, &labels, &config)
            .unwrap();

        assert!(result.shapelets.len() <= config.n_shapelets);
        assert_eq!(result.information_gains.len(), result.shapelets.len());
        assert_eq!(result.n_series, 4);

        // The pulse should yield at least one discriminative shapelet.
        assert!(result.information_gains.iter().any(|&gain| gain > 0.0));
    }
}