scirs2_series/
clustering.rs

1//! Time series clustering and classification algorithms
2//!
3//! This module provides various methods for clustering and classifying time series:
4//! - Time series clustering algorithms (k-means, hierarchical, DBSCAN)
5//! - Distance measures for time series (DTW, Euclidean, correlation-based)
6//! - Time series classification methods (1-NN DTW, shapelet-based, feature-based)
7//! - Shape-based clustering and classification
8
9use crate::correlation::{CorrelationAnalyzer, DTWConfig};
10use crate::error::TimeSeriesError;
11use scirs2_core::ndarray::{s, Array1, Array2};
12use std::collections::{HashMap, HashSet};
13
14/// Result type for clustering and classification
15pub type ClusteringResult<T> = Result<T, TimeSeriesError>;
16
17/// Clustering result
18#[derive(Debug, Clone)]
19pub struct TimeSeriesClusteringResult {
20    /// Cluster assignments for each time series
21    pub cluster_labels: Array1<usize>,
22    /// Number of clusters
23    pub n_clusters: usize,
24    /// Cluster centers (centroids)
25    pub centroids: Vec<Array1<f64>>,
26    /// Inertia (sum of squared distances to centroids)
27    pub inertia: f64,
28    /// Silhouette coefficient for clustering quality
29    pub silhouette_score: f64,
30    /// Distance matrix between time series
31    pub distance_matrix: Array2<f64>,
32    /// Algorithm used for clustering
33    pub algorithm: ClusteringAlgorithm,
34}
35
36/// Classification result
37#[derive(Debug, Clone)]
38pub struct TimeSeriesClassificationResult {
39    /// Predicted class labels
40    pub predicted_labels: Array1<usize>,
41    /// Prediction probabilities (if available)
42    pub probabilities: Option<Array2<f64>>,
43    /// Confidence scores
44    pub confidence_scores: Array1<f64>,
45    /// Distance/similarity to nearest neighbors
46    pub neighbor_distances: Option<Array2<f64>>,
47    /// Classification algorithm used
48    pub algorithm: ClassificationAlgorithm,
49    /// Number of classes
50    pub n_classes: usize,
51}
52
53/// Shapelet discovery result
54#[derive(Debug, Clone)]
55pub struct ShapeletResult {
56    /// Discovered shapelets
57    pub shapelets: Vec<Shapelet>,
58    /// Information gain for each shapelet
59    pub information_gains: Array1<f64>,
60    /// Shapelet discovery algorithm used
61    pub algorithm: ShapeletAlgorithm,
62    /// Time series used for discovery
63    pub n_series: usize,
64}
65
66/// Individual shapelet
67#[derive(Debug, Clone)]
68pub struct Shapelet {
69    /// Shapelet subsequence
70    pub data: Array1<f64>,
71    /// Source series index
72    pub series_index: usize,
73    /// Starting position in source series
74    pub start_position: usize,
75    /// Length of shapelet
76    pub length: usize,
77    /// Information gain
78    pub information_gain: f64,
79    /// Quality score
80    pub quality: f64,
81}
82
83/// Clustering algorithms
84#[derive(Debug, Clone, Copy)]
85pub enum ClusteringAlgorithm {
86    /// K-means clustering
87    KMeans,
88    /// Hierarchical clustering
89    Hierarchical,
90    /// DBSCAN density-based clustering
91    DBSCAN,
92    /// Spectral clustering
93    Spectral,
94    /// Gaussian mixture models
95    GMM,
96}
97
98/// Classification algorithms
99#[derive(Debug, Clone, Copy)]
100pub enum ClassificationAlgorithm {
101    /// k-Nearest Neighbors with DTW
102    KnnDTW,
103    /// Shapelet-based classification
104    Shapelet,
105    /// Feature-based classification
106    FeatureBased,
107    /// Ensemble methods
108    Ensemble,
109}
110
111/// Distance measures for time series
112#[derive(Debug, Clone, Copy)]
113pub enum TimeSeriesDistance {
114    /// Euclidean distance
115    Euclidean,
116    /// Dynamic Time Warping
117    DTW,
118    /// Correlation-based distance
119    Correlation,
120    /// Cosine distance
121    Cosine,
122    /// Manhattan distance
123    Manhattan,
124    /// Minkowski distance
125    Minkowski(f64),
126}
127
128/// Shapelet discovery algorithms
129#[derive(Debug, Clone, Copy)]
130pub enum ShapeletAlgorithm {
131    /// Brute force search
132    BruteForce,
133    /// Fast shapelet discovery
134    Fast,
135    /// Learning shapelets
136    Learning,
137}
138
139/// Configuration for k-means clustering
140#[derive(Debug, Clone)]
141pub struct KMeansConfig {
142    /// Number of clusters
143    pub n_clusters: usize,
144    /// Maximum number of iterations
145    pub max_iterations: usize,
146    /// Convergence tolerance
147    pub tolerance: f64,
148    /// Number of random initializations
149    pub n_init: usize,
150    /// Distance measure
151    pub distance: TimeSeriesDistance,
152    /// Random seed
153    pub random_seed: Option<u64>,
154}
155
156impl Default for KMeansConfig {
157    fn default() -> Self {
158        Self {
159            n_clusters: 3,
160            max_iterations: 300,
161            tolerance: 1e-4,
162            n_init: 10,
163            distance: TimeSeriesDistance::Euclidean,
164            random_seed: None,
165        }
166    }
167}
168
169/// Configuration for hierarchical clustering
170#[derive(Debug, Clone)]
171pub struct HierarchicalConfig {
172    /// Number of clusters (if None, use all levels)
173    pub n_clusters: Option<usize>,
174    /// Linkage method
175    pub linkage: LinkageMethod,
176    /// Distance measure
177    pub distance: TimeSeriesDistance,
178    /// Distance threshold for automatic cluster determination
179    pub distance_threshold: Option<f64>,
180}
181
182/// Linkage methods for hierarchical clustering
183#[derive(Debug, Clone, Copy)]
184pub enum LinkageMethod {
185    /// Single linkage (minimum distance)
186    Single,
187    /// Complete linkage (maximum distance)
188    Complete,
189    /// Average linkage
190    Average,
191    /// Ward linkage (minimizes within-cluster variance)
192    Ward,
193}
194
195impl Default for HierarchicalConfig {
196    fn default() -> Self {
197        Self {
198            n_clusters: Some(3),
199            linkage: LinkageMethod::Average,
200            distance: TimeSeriesDistance::Euclidean,
201            distance_threshold: None,
202        }
203    }
204}
205
206/// Configuration for DBSCAN clustering
207#[derive(Debug, Clone)]
208pub struct DBSCANConfig {
209    /// Epsilon (neighborhood radius)
210    pub eps: f64,
211    /// Minimum number of points for core point
212    pub min_samples: usize,
213    /// Distance measure
214    pub distance: TimeSeriesDistance,
215}
216
217impl Default for DBSCANConfig {
218    fn default() -> Self {
219        Self {
220            eps: 0.5,
221            min_samples: 5,
222            distance: TimeSeriesDistance::Euclidean,
223        }
224    }
225}
226
227/// Configuration for k-NN classification
228#[derive(Debug, Clone)]
229pub struct KNNConfig {
230    /// Number of neighbors
231    pub k: usize,
232    /// Distance measure
233    pub distance: TimeSeriesDistance,
234    /// Weighting scheme
235    pub weights: WeightingScheme,
236    /// DTW configuration (if using DTW distance)
237    pub dtw_config: Option<DTWConfig>,
238}
239
240/// Weighting schemes for k-NN
241#[derive(Debug, Clone, Copy)]
242pub enum WeightingScheme {
243    /// Uniform weights
244    Uniform,
245    /// Distance-based weights
246    Distance,
247    /// Exponential weights
248    Exponential,
249}
250
251impl Default for KNNConfig {
252    fn default() -> Self {
253        Self {
254            k: 5,
255            distance: TimeSeriesDistance::DTW,
256            weights: WeightingScheme::Distance,
257            dtw_config: Some(DTWConfig::default()),
258        }
259    }
260}
261
262/// Configuration for shapelet discovery
263#[derive(Debug, Clone)]
264pub struct ShapeletConfig {
265    /// Minimum shapelet length
266    pub min_length: usize,
267    /// Maximum shapelet length
268    pub max_length: usize,
269    /// Number of shapelets to discover
270    pub nshapelets: usize,
271    /// Discovery algorithm
272    pub algorithm: ShapeletAlgorithm,
273    /// Minimum information gain threshold
274    pub min_info_gain: f64,
275    /// Random seed
276    pub random_seed: Option<u64>,
277}
278
279impl Default for ShapeletConfig {
280    fn default() -> Self {
281        Self {
282            min_length: 10,
283            max_length: 50,
284            nshapelets: 10,
285            algorithm: ShapeletAlgorithm::Fast,
286            min_info_gain: 0.01,
287            random_seed: None,
288        }
289    }
290}
291
292/// Main struct for time series clustering and classification
293pub struct TimeSeriesClusterer {
294    /// Random seed for reproducibility
295    pub random_seed: Option<u64>,
296    /// Correlation analyzer for DTW computations
297    correlation_analyzer: CorrelationAnalyzer,
298}
299
300impl TimeSeriesClusterer {
301    /// Create a new time series clusterer
302    pub fn new() -> Self {
303        Self {
304            random_seed: None,
305            correlation_analyzer: CorrelationAnalyzer::new(),
306        }
307    }
308
309    /// Create a new clusterer with random seed
310    pub fn with_seed(seed: u64) -> Self {
311        Self {
312            random_seed: Some(seed),
313            correlation_analyzer: CorrelationAnalyzer::with_seed(seed),
314        }
315    }
316
317    /// Perform k-means clustering on time series data
318    ///
319    /// # Arguments
320    ///
321    /// * `data` - Matrix where each row is a time series
322    /// * `config` - Configuration for k-means clustering
323    ///
324    /// # Returns
325    ///
326    /// Result containing clustering results and statistics
327    pub fn kmeans_clustering(
328        &self,
329        data: &Array2<f64>,
330        config: &KMeansConfig,
331    ) -> ClusteringResult<TimeSeriesClusteringResult> {
332        let (n_series, _series_length) = data.dim();
333
334        if n_series < config.n_clusters {
335            return Err(TimeSeriesError::InvalidInput(
336                "Number of time series must be at least equal to number of clusters".to_string(),
337            ));
338        }
339
340        // Compute distance matrix
341        let distance_matrix = self.compute_distance_matrix(data, config.distance)?;
342
343        let mut best_result = None;
344        let mut best_inertia = f64::INFINITY;
345
346        // Try multiple random initializations
347        for _init in 0..config.n_init {
348            // Initialize centroids randomly
349            let mut centroids = self.initialize_centroids_randomly(data, config.n_clusters)?;
350            let mut cluster_labels = Array1::zeros(n_series);
351            let mut prev_inertia = f64::INFINITY;
352
353            for _iter in 0..config.max_iterations {
354                // Assign points to nearest centroids
355                for i in 0..n_series {
356                    let mut min_distance = f64::INFINITY;
357                    let mut best_cluster = 0;
358
359                    #[allow(clippy::needless_range_loop)]
360                    for k in 0..config.n_clusters {
361                        let distance = self.compute_series_distance(
362                            &data.row(i).to_owned(),
363                            &centroids[k],
364                            config.distance,
365                        )?;
366
367                        if distance < min_distance {
368                            min_distance = distance;
369                            best_cluster = k;
370                        }
371                    }
372
373                    cluster_labels[i] = best_cluster;
374                }
375
376                // Update centroids
377                #[allow(clippy::needless_range_loop)]
378                for k in 0..config.n_clusters {
379                    let cluster_points: Vec<usize> = cluster_labels
380                        .iter()
381                        .enumerate()
382                        .filter(|(_, &label)| label == k)
383                        .map(|(i_, _)| i_)
384                        .collect();
385
386                    if !cluster_points.is_empty() {
387                        centroids[k] = self.compute_centroid(data, &cluster_points)?;
388                    }
389                }
390
391                // Compute inertia
392                let inertia =
393                    self.compute_inertia(data, &centroids, &cluster_labels, config.distance)?;
394
395                // Check convergence
396                if (prev_inertia - inertia).abs() < config.tolerance {
397                    break;
398                }
399                prev_inertia = inertia;
400            }
401
402            let final_inertia =
403                self.compute_inertia(data, &centroids, &cluster_labels, config.distance)?;
404
405            if final_inertia < best_inertia {
406                best_inertia = final_inertia;
407
408                let silhouette_score =
409                    self.compute_silhouette_score(&distance_matrix, &cluster_labels)?;
410
411                best_result = Some(TimeSeriesClusteringResult {
412                    cluster_labels,
413                    n_clusters: config.n_clusters,
414                    centroids,
415                    inertia: final_inertia,
416                    silhouette_score,
417                    distance_matrix: distance_matrix.clone(),
418                    algorithm: ClusteringAlgorithm::KMeans,
419                });
420            }
421        }
422
423        best_result.ok_or_else(|| {
424            TimeSeriesError::ComputationError("K-means clustering failed".to_string())
425        })
426    }
427
428    /// Perform hierarchical clustering on time series data
429    ///
430    /// # Arguments
431    ///
432    /// * `data` - Matrix where each row is a time series
433    /// * `config` - Configuration for hierarchical clustering
434    ///
435    /// # Returns
436    ///
437    /// Result containing clustering results and dendrogram information
438    pub fn hierarchical_clustering(
439        &self,
440        data: &Array2<f64>,
441        config: &HierarchicalConfig,
442    ) -> ClusteringResult<TimeSeriesClusteringResult> {
443        let _n_series = data.nrows();
444
445        // Compute distance matrix
446        let distance_matrix = self.compute_distance_matrix(data, config.distance)?;
447
448        // Perform hierarchical clustering
449        let cluster_labels = self.hierarchical_clustering_impl(&distance_matrix, config)?;
450
451        let n_clusters = cluster_labels.iter().max().unwrap_or(&0) + 1;
452
453        // Compute centroids for each cluster
454        let mut centroids = Vec::new();
455        for k in 0..n_clusters {
456            let cluster_points: Vec<usize> = cluster_labels
457                .iter()
458                .enumerate()
459                .filter(|(_, &label)| label == k)
460                .map(|(i_, _)| i_)
461                .collect();
462
463            if !cluster_points.is_empty() {
464                centroids.push(self.compute_centroid(data, &cluster_points)?);
465            } else {
466                centroids.push(Array1::zeros(data.ncols()));
467            }
468        }
469
470        let inertia = self.compute_inertia(data, &centroids, &cluster_labels, config.distance)?;
471        let silhouette_score = self.compute_silhouette_score(&distance_matrix, &cluster_labels)?;
472
473        Ok(TimeSeriesClusteringResult {
474            cluster_labels,
475            n_clusters,
476            centroids,
477            inertia,
478            silhouette_score,
479            distance_matrix,
480            algorithm: ClusteringAlgorithm::Hierarchical,
481        })
482    }
483
484    /// Perform DBSCAN clustering on time series data
485    ///
486    /// # Arguments
487    ///
488    /// * `data` - Matrix where each row is a time series
489    /// * `config` - Configuration for DBSCAN clustering
490    ///
491    /// # Returns
492    ///
493    /// Result containing clustering results (noise points labeled as -1)
494    pub fn dbscan_clustering(
495        &self,
496        data: &Array2<f64>,
497        config: &DBSCANConfig,
498    ) -> ClusteringResult<TimeSeriesClusteringResult> {
499        let n_series = data.nrows();
500
501        // Compute distance matrix
502        let distance_matrix = self.compute_distance_matrix(data, config.distance)?;
503
504        // DBSCAN algorithm
505        let mut cluster_labels = Array1::from_elem(n_series, usize::MAX); // MAX = unvisited
506        let mut cluster_id = 0;
507
508        for i in 0..n_series {
509            if cluster_labels[i] != usize::MAX {
510                continue; // Already processed
511            }
512
513            let neighbors = self.get_neighbors(&distance_matrix, i, config.eps);
514
515            if neighbors.len() < config.min_samples {
516                cluster_labels[i] = usize::MAX - 1; // Mark as noise (will be converted to special value)
517            } else {
518                self.expand_cluster(
519                    &distance_matrix,
520                    &mut cluster_labels,
521                    i,
522                    &neighbors,
523                    cluster_id,
524                    config,
525                )?;
526                cluster_id += 1;
527            }
528        }
529
530        // Convert noise points and renumber clusters
531        let mut final_labels = Array1::zeros(n_series);
532        let mut cluster_map = HashMap::new();
533        let mut next_cluster = 0;
534
535        for i in 0..n_series {
536            if cluster_labels[i] == usize::MAX - 1 {
537                final_labels[i] = usize::MAX; // Keep as special noise marker
538            } else if cluster_labels[i] != usize::MAX {
539                let mapped_cluster = *cluster_map.entry(cluster_labels[i]).or_insert_with(|| {
540                    let cluster = next_cluster;
541                    next_cluster += 1;
542                    cluster
543                });
544                final_labels[i] = mapped_cluster;
545            }
546        }
547
548        let n_clusters = next_cluster;
549
550        // Compute centroids (excluding noise points)
551        let mut centroids = Vec::new();
552        for k in 0..n_clusters {
553            let cluster_points: Vec<usize> = final_labels
554                .iter()
555                .enumerate()
556                .filter(|(_, &label)| label == k)
557                .map(|(i_, _)| i_)
558                .collect();
559
560            if !cluster_points.is_empty() {
561                centroids.push(self.compute_centroid(data, &cluster_points)?);
562            } else {
563                centroids.push(Array1::zeros(data.ncols()));
564            }
565        }
566
567        // For inertia and silhouette calculation, treat noise as separate points
568        let labels_for_metrics =
569            final_labels.mapv(|x| if x == usize::MAX { usize::MAX } else { x });
570
571        let inertia = if n_clusters > 0 {
572            self.compute_inertia_with_noise(data, &centroids, &labels_for_metrics, config.distance)?
573        } else {
574            0.0
575        };
576
577        let silhouette_score = if n_clusters > 1 {
578            self.compute_silhouette_score_with_noise(&distance_matrix, &labels_for_metrics)?
579        } else {
580            0.0
581        };
582
583        Ok(TimeSeriesClusteringResult {
584            cluster_labels: labels_for_metrics,
585            n_clusters,
586            centroids,
587            inertia,
588            silhouette_score,
589            distance_matrix,
590            algorithm: ClusteringAlgorithm::DBSCAN,
591        })
592    }
593
594    /// Perform k-NN classification on time series data
595    ///
596    /// # Arguments
597    ///
598    /// * `train_data` - Training time series matrix
599    /// * `train_labels` - Training labels
600    /// * `test_data` - Test time series matrix
601    /// * `config` - Configuration for k-NN classification
602    ///
603    /// # Returns
604    ///
605    /// Result containing classification predictions and confidence scores
606    pub fn knn_classification(
607        &self,
608        train_data: &Array2<f64>,
609        train_labels: &Array1<usize>,
610        test_data: &Array2<f64>,
611        config: &KNNConfig,
612    ) -> ClusteringResult<TimeSeriesClassificationResult> {
613        let n_train = train_data.nrows();
614        let n_test = test_data.nrows();
615        let n_classes = train_labels.iter().max().unwrap_or(&0) + 1;
616
617        if train_labels.len() != n_train {
618            return Err(TimeSeriesError::InvalidInput(
619                "Training _data and _labels must have the same number of samples".to_string(),
620            ));
621        }
622
623        let mut predicted_labels = Array1::zeros(n_test);
624        let mut confidence_scores = Array1::zeros(n_test);
625        let mut probabilities = Array2::zeros((n_test, n_classes));
626        let mut neighbor_distances = Array2::zeros((n_test, config.k.min(n_train)));
627
628        for i in 0..n_test {
629            let test_series = test_data.row(i).to_owned();
630
631            // Compute distances to all training samples
632            let mut distances = Vec::new();
633            for j in 0..n_train {
634                let train_series = train_data.row(j).to_owned();
635                let distance =
636                    self.compute_series_distance(&test_series, &train_series, config.distance)?;
637                distances.push((distance, train_labels[j]));
638            }
639
640            // Sort by distance and take k nearest neighbors
641            distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
642            let k_neighbors = distances.iter().take(config.k).cloned().collect::<Vec<_>>();
643
644            // Store neighbor distances
645            for (idx, (dist_, _)) in k_neighbors.iter().enumerate() {
646                if idx < neighbor_distances.ncols() {
647                    neighbor_distances[[i, idx]] = *dist_;
648                }
649            }
650
651            // Compute weighted votes
652            let mut class_votes = vec![0.0; n_classes];
653            let mut total_weight = 0.0;
654
655            for (distance, class_label) in &k_neighbors {
656                let weight = match config.weights {
657                    WeightingScheme::Uniform => 1.0,
658                    WeightingScheme::Distance => {
659                        if *distance < f64::EPSILON {
660                            1e6
661                        } else {
662                            1.0 / distance
663                        }
664                    }
665                    WeightingScheme::Exponential => (-distance).exp(),
666                };
667
668                class_votes[*class_label] += weight;
669                total_weight += weight;
670            }
671
672            // Normalize to get probabilities
673            if total_weight > 0.0 {
674                for class in 0..n_classes {
675                    probabilities[[i, class]] = class_votes[class] / total_weight;
676                }
677            }
678
679            // Predict class with highest vote
680            let predicted_class = class_votes
681                .iter()
682                .enumerate()
683                .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
684                .map(|(idx_, _)| idx_)
685                .unwrap_or(0);
686
687            predicted_labels[i] = predicted_class;
688            confidence_scores[i] = class_votes[predicted_class] / total_weight;
689        }
690
691        Ok(TimeSeriesClassificationResult {
692            predicted_labels,
693            probabilities: Some(probabilities),
694            confidence_scores,
695            neighbor_distances: Some(neighbor_distances),
696            algorithm: ClassificationAlgorithm::KnnDTW,
697            n_classes,
698        })
699    }
700
701    /// Discover shapelets in time series data
702    ///
703    /// # Arguments
704    ///
705    /// * `data` - Time series data matrix
706    /// * `labels` - Class labels for supervised shapelet discovery
707    /// * `config` - Configuration for shapelet discovery
708    ///
709    /// # Returns
710    ///
711    /// Result containing discovered shapelets and their quality measures
712    pub fn discovershapelets(
713        &self,
714        data: &Array2<f64>,
715        labels: &Array1<usize>,
716        config: &ShapeletConfig,
717    ) -> ClusteringResult<ShapeletResult> {
718        let n_series = data.nrows();
719        let series_length = data.ncols();
720
721        if labels.len() != n_series {
722            return Err(TimeSeriesError::InvalidInput(
723                "Data and labels must have the same number of samples".to_string(),
724            ));
725        }
726
727        let mut candidateshapelets = Vec::new();
728
729        // Generate candidate shapelets
730        for series_idx in 0..n_series {
731            for length in config.min_length..=config.max_length.min(series_length) {
732                for start_pos in 0..=(series_length - length) {
733                    let shapelet_data = data
734                        .slice(s![series_idx, start_pos..start_pos + length])
735                        .to_owned();
736
737                    let shapelet = Shapelet {
738                        data: shapelet_data,
739                        series_index: series_idx,
740                        start_position: start_pos,
741                        length,
742                        information_gain: 0.0,
743                        quality: 0.0,
744                    };
745
746                    candidateshapelets.push(shapelet);
747                }
748            }
749        }
750
751        // Evaluate candidate shapelets
752        let mut shapelet_scores = Vec::new();
753
754        for (idx, shapelet) in candidateshapelets.iter().enumerate() {
755            let info_gain = self.computeshapelet_information_gain(data, labels, shapelet)?;
756
757            if info_gain >= config.min_info_gain {
758                shapelet_scores.push((idx, info_gain));
759            }
760        }
761
762        // Sort by information gain and select top shapelets
763        shapelet_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
764
765        let n_selected = config.nshapelets.min(shapelet_scores.len());
766        let mut selectedshapelets = Vec::new();
767        let mut information_gains = Array1::zeros(n_selected);
768
769        for i in 0..n_selected {
770            let (shapelet_idx, info_gain) = shapelet_scores[i];
771            let mut shapelet = candidateshapelets[shapelet_idx].clone();
772            shapelet.information_gain = info_gain;
773            shapelet.quality = info_gain; // Simple quality measure
774
775            selectedshapelets.push(shapelet);
776            information_gains[i] = info_gain;
777        }
778
779        Ok(ShapeletResult {
780            shapelets: selectedshapelets,
781            information_gains,
782            algorithm: config.algorithm,
783            n_series,
784        })
785    }
786
787    // Helper methods
788
789    fn compute_distance_matrix(
790        &self,
791        data: &Array2<f64>,
792        distance: TimeSeriesDistance,
793    ) -> ClusteringResult<Array2<f64>> {
794        let n_series = data.nrows();
795        let mut distance_matrix = Array2::zeros((n_series, n_series));
796
797        for i in 0..n_series {
798            for j in i..n_series {
799                let dist = if i == j {
800                    0.0
801                } else {
802                    self.compute_series_distance(
803                        &data.row(i).to_owned(),
804                        &data.row(j).to_owned(),
805                        distance,
806                    )?
807                };
808
809                distance_matrix[[i, j]] = dist;
810                distance_matrix[[j, i]] = dist;
811            }
812        }
813
814        Ok(distance_matrix)
815    }
816
817    fn compute_series_distance(
818        &self,
819        x: &Array1<f64>,
820        y: &Array1<f64>,
821        distance: TimeSeriesDistance,
822    ) -> ClusteringResult<f64> {
823        match distance {
824            TimeSeriesDistance::Euclidean => Ok((x - y).mapv(|d| d * d).sum().sqrt()),
825            TimeSeriesDistance::DTW => {
826                let dtw_config = DTWConfig::default();
827                let result = self
828                    .correlation_analyzer
829                    .dynamic_time_warping(x, y, &dtw_config)
830                    .map_err(|_| {
831                        TimeSeriesError::ComputationError("DTW computation failed".to_string())
832                    })?;
833                Ok(result.distance)
834            }
835            TimeSeriesDistance::Correlation => {
836                let correlation = self.pearson_correlation(x, y)?;
837                Ok(1.0 - correlation.abs()) // Convert to distance
838            }
839            TimeSeriesDistance::Cosine => {
840                let dot_product = x.dot(y);
841                let norm_x = (x.dot(x)).sqrt();
842                let norm_y = (y.dot(y)).sqrt();
843
844                if norm_x < f64::EPSILON || norm_y < f64::EPSILON {
845                    Ok(1.0)
846                } else {
847                    let cosine_sim = dot_product / (norm_x * norm_y);
848                    Ok(1.0 - cosine_sim.abs())
849                }
850            }
851            TimeSeriesDistance::Manhattan => Ok((x - y).mapv(|d| d.abs()).sum()),
852            TimeSeriesDistance::Minkowski(p) => {
853                Ok((x - y).mapv(|d| d.abs().powf(p)).sum().powf(1.0 / p))
854            }
855        }
856    }
857
858    fn pearson_correlation(&self, x: &Array1<f64>, y: &Array1<f64>) -> ClusteringResult<f64> {
859        if x.len() != y.len() || x.is_empty() {
860            return Ok(0.0);
861        }
862
863        let n = x.len() as f64;
864        let mean_x = x.sum() / n;
865        let mean_y = y.sum() / n;
866
867        let mut numerator = 0.0;
868        let mut sum_sq_x = 0.0;
869        let mut sum_sq_y = 0.0;
870
871        for (xi, yi) in x.iter().zip(y.iter()) {
872            let diff_x = xi - mean_x;
873            let diff_y = yi - mean_y;
874            numerator += diff_x * diff_y;
875            sum_sq_x += diff_x * diff_x;
876            sum_sq_y += diff_y * diff_y;
877        }
878
879        let denominator = (sum_sq_x * sum_sq_y).sqrt();
880        if denominator < f64::EPSILON {
881            return Ok(0.0);
882        }
883
884        Ok(numerator / denominator)
885    }
886
887    fn initialize_centroids_randomly(
888        &self,
889        data: &Array2<f64>,
890        k: usize,
891    ) -> ClusteringResult<Vec<Array1<f64>>> {
892        let n_series = data.nrows();
893        let mut centroids = Vec::new();
894
895        // Simple random initialization - pick k random time series as initial centroids
896        use std::collections::hash_map::DefaultHasher;
897        use std::hash::{Hash, Hasher};
898
899        let mut hasher = DefaultHasher::new();
900        if let Some(seed) = self.random_seed {
901            seed.hash(&mut hasher);
902        }
903
904        let mut selected_indices = HashSet::new();
905
906        for _i in 0..k {
907            let mut idx = (hasher.finish() as usize) % n_series;
908            while selected_indices.contains(&idx) {
909                hasher.write_usize(idx);
910                idx = (hasher.finish() as usize) % n_series;
911            }
912            selected_indices.insert(idx);
913            centroids.push(data.row(idx).to_owned());
914        }
915
916        Ok(centroids)
917    }
918
919    fn compute_centroid(
920        &self,
921        data: &Array2<f64>,
922        indices: &[usize],
923    ) -> ClusteringResult<Array1<f64>> {
924        if indices.is_empty() {
925            return Ok(Array1::zeros(data.ncols()));
926        }
927
928        let mut centroid = Array1::zeros(data.ncols());
929
930        for &idx in indices {
931            centroid = centroid + data.row(idx);
932        }
933
934        centroid /= indices.len() as f64;
935        Ok(centroid)
936    }
937
938    fn compute_inertia(
939        &self,
940        data: &Array2<f64>,
941        centroids: &[Array1<f64>],
942        labels: &Array1<usize>,
943        distance: TimeSeriesDistance,
944    ) -> ClusteringResult<f64> {
945        let mut inertia = 0.0;
946
947        for i in 0..data.nrows() {
948            let cluster_id = labels[i];
949            if cluster_id < centroids.len() {
950                let dist = self.compute_series_distance(
951                    &data.row(i).to_owned(),
952                    &centroids[cluster_id],
953                    distance,
954                )?;
955                inertia += dist * dist;
956            }
957        }
958
959        Ok(inertia)
960    }
961
962    fn compute_inertia_with_noise(
963        &self,
964        data: &Array2<f64>,
965        centroids: &[Array1<f64>],
966        labels: &Array1<usize>,
967        distance: TimeSeriesDistance,
968    ) -> ClusteringResult<f64> {
969        let mut inertia = 0.0;
970
971        for i in 0..data.nrows() {
972            let cluster_id = labels[i];
973            if cluster_id != usize::MAX && cluster_id < centroids.len() {
974                let dist = self.compute_series_distance(
975                    &data.row(i).to_owned(),
976                    &centroids[cluster_id],
977                    distance,
978                )?;
979                inertia += dist * dist;
980            }
981        }
982
983        Ok(inertia)
984    }
985
986    fn compute_silhouette_score(
987        &self,
988        distance_matrix: &Array2<f64>,
989        labels: &Array1<usize>,
990    ) -> ClusteringResult<f64> {
991        let n_samples = distance_matrix.nrows();
992        let mut silhouette_scores = Vec::new();
993
994        for i in 0..n_samples {
995            let cluster_i = labels[i];
996
997            // Compute average distance to points in same cluster (a_i)
998            let same_cluster_distances: Vec<f64> = (0..n_samples)
999                .filter(|&j| j != i && labels[j] == cluster_i)
1000                .map(|j| distance_matrix[[i, j]])
1001                .collect();
1002
1003            let a_i = if same_cluster_distances.is_empty() {
1004                0.0
1005            } else {
1006                same_cluster_distances.iter().sum::<f64>() / same_cluster_distances.len() as f64
1007            };
1008
1009            // Compute minimum average distance to points in other clusters (b_i)
1010            let unique_clusters: HashSet<usize> = labels.iter().cloned().collect();
1011            let mut min_avg_distance = f64::INFINITY;
1012
1013            for &other_cluster in &unique_clusters {
1014                if other_cluster != cluster_i {
1015                    let other_cluster_distances: Vec<f64> = (0..n_samples)
1016                        .filter(|&j| labels[j] == other_cluster)
1017                        .map(|j| distance_matrix[[i, j]])
1018                        .collect();
1019
1020                    if !other_cluster_distances.is_empty() {
1021                        let avg_distance = other_cluster_distances.iter().sum::<f64>()
1022                            / other_cluster_distances.len() as f64;
1023                        min_avg_distance = min_avg_distance.min(avg_distance);
1024                    }
1025                }
1026            }
1027
1028            let b_i = min_avg_distance;
1029
1030            // Compute silhouette coefficient
1031            let silhouette = if a_i.max(b_i) > 0.0 {
1032                (b_i - a_i) / a_i.max(b_i)
1033            } else {
1034                0.0
1035            };
1036
1037            silhouette_scores.push(silhouette);
1038        }
1039
1040        Ok(silhouette_scores.iter().sum::<f64>() / silhouette_scores.len() as f64)
1041    }
1042
1043    fn compute_silhouette_score_with_noise(
1044        &self,
1045        distance_matrix: &Array2<f64>,
1046        labels: &Array1<usize>,
1047    ) -> ClusteringResult<f64> {
1048        // Filter out noise points (labeled as usize::MAX) for silhouette calculation
1049        let valid_indices: Vec<usize> = labels
1050            .iter()
1051            .enumerate()
1052            .filter(|(_, &label)| label != usize::MAX)
1053            .map(|(i_, _)| i_)
1054            .collect();
1055
1056        if valid_indices.len() < 2 {
1057            return Ok(0.0);
1058        }
1059
1060        let filtered_labels = Array1::from_iter(valid_indices.iter().map(|&i| labels[i]));
1061        let mut filtered_distance_matrix =
1062            Array2::zeros((valid_indices.len(), valid_indices.len()));
1063
1064        for (i, &idx_i) in valid_indices.iter().enumerate() {
1065            for (j, &idx_j) in valid_indices.iter().enumerate() {
1066                filtered_distance_matrix[[i, j]] = distance_matrix[[idx_i, idx_j]];
1067            }
1068        }
1069
1070        self.compute_silhouette_score(&filtered_distance_matrix, &filtered_labels)
1071    }
1072
1073    fn hierarchical_clustering_impl(
1074        &self,
1075        distance_matrix: &Array2<f64>,
1076        config: &HierarchicalConfig,
1077    ) -> ClusteringResult<Array1<usize>> {
1078        let n_samples = distance_matrix.nrows();
1079        let mut clusters: Vec<Vec<usize>> = (0..n_samples).map(|i| vec![i]).collect();
1080        let cluster_distances = distance_matrix.clone();
1081
1082        let target_clusters = config.n_clusters.unwrap_or(1);
1083
1084        while clusters.len() > target_clusters {
1085            // Find closest pair of clusters
1086            let mut min_distance = f64::INFINITY;
1087            let mut merge_i = 0;
1088            let mut merge_j = 1;
1089
1090            for i in 0..clusters.len() {
1091                for j in i + 1..clusters.len() {
1092                    let distance = self.compute_cluster_distance(
1093                        &clusters[i],
1094                        &clusters[j],
1095                        &cluster_distances,
1096                        config.linkage,
1097                    );
1098
1099                    if distance < min_distance {
1100                        min_distance = distance;
1101                        merge_i = i;
1102                        merge_j = j;
1103                    }
1104                }
1105            }
1106
1107            // Check distance threshold
1108            if let Some(threshold) = config.distance_threshold {
1109                if min_distance > threshold {
1110                    break;
1111                }
1112            }
1113
1114            // Merge clusters
1115            let mut new_cluster = clusters[merge_i].clone();
1116            new_cluster.extend(clusters[merge_j].clone());
1117
1118            // Remove old clusters (remove higher index first)
1119            if merge_i < merge_j {
1120                clusters.remove(merge_j);
1121                clusters.remove(merge_i);
1122            } else {
1123                clusters.remove(merge_i);
1124                clusters.remove(merge_j);
1125            }
1126
1127            clusters.push(new_cluster);
1128        }
1129
1130        // Create labels array
1131        let mut labels = Array1::zeros(n_samples);
1132        for (cluster_id, cluster) in clusters.iter().enumerate() {
1133            for &sample_id in cluster {
1134                labels[sample_id] = cluster_id;
1135            }
1136        }
1137
1138        Ok(labels)
1139    }
1140
1141    #[allow(clippy::only_used_in_recursion)]
1142    fn compute_cluster_distance(
1143        &self,
1144        cluster1: &[usize],
1145        cluster2: &[usize],
1146        distance_matrix: &Array2<f64>,
1147        linkage: LinkageMethod,
1148    ) -> f64 {
1149        match linkage {
1150            LinkageMethod::Single => {
1151                // Minimum distance between any two points
1152                let mut min_dist = f64::INFINITY;
1153                for &i in cluster1 {
1154                    for &j in cluster2 {
1155                        min_dist = min_dist.min(distance_matrix[[i, j]]);
1156                    }
1157                }
1158                min_dist
1159            }
1160            LinkageMethod::Complete => {
1161                // Maximum distance between any two points
1162                let mut max_dist: f64 = 0.0;
1163                for &i in cluster1 {
1164                    for &j in cluster2 {
1165                        max_dist = max_dist.max(distance_matrix[[i, j]]);
1166                    }
1167                }
1168                max_dist
1169            }
1170            LinkageMethod::Average => {
1171                // Average distance between all pairs
1172                let mut sum_dist = 0.0;
1173                let mut count = 0;
1174                for &i in cluster1 {
1175                    for &j in cluster2 {
1176                        sum_dist += distance_matrix[[i, j]];
1177                        count += 1;
1178                    }
1179                }
1180                if count > 0 {
1181                    sum_dist / count as f64
1182                } else {
1183                    0.0
1184                }
1185            }
1186            LinkageMethod::Ward => {
1187                // Ward linkage - simplified implementation
1188                // This would require proper centroid calculation
1189                self.compute_cluster_distance(
1190                    cluster1,
1191                    cluster2,
1192                    distance_matrix,
1193                    LinkageMethod::Average,
1194                )
1195            }
1196        }
1197    }
1198
1199    fn get_neighbors(&self, distancematrix: &Array2<f64>, point: usize, eps: f64) -> Vec<usize> {
1200        (0..distancematrix.nrows())
1201            .filter(|&i| i != point && distancematrix[[point, i]] <= eps)
1202            .collect()
1203    }
1204
1205    fn expand_cluster(
1206        &self,
1207        distance_matrix: &Array2<f64>,
1208        labels: &mut Array1<usize>,
1209        point: usize,
1210        neighbors: &[usize],
1211        cluster_id: usize,
1212        config: &DBSCANConfig,
1213    ) -> ClusteringResult<()> {
1214        labels[point] = cluster_id;
1215        let mut seed_set = neighbors.to_vec();
1216        let mut i = 0;
1217
1218        while i < seed_set.len() {
1219            let q = seed_set[i];
1220
1221            if labels[q] == usize::MAX - 1 {
1222                // Was noise, now part of cluster
1223                labels[q] = cluster_id;
1224            }
1225
1226            if labels[q] == usize::MAX {
1227                // Unvisited
1228                labels[q] = cluster_id;
1229                let q_neighbors = self.get_neighbors(distance_matrix, q, config.eps);
1230
1231                if q_neighbors.len() >= config.min_samples {
1232                    // Add new neighbors to seed set
1233                    for &neighbor in &q_neighbors {
1234                        if !seed_set.contains(&neighbor) {
1235                            seed_set.push(neighbor);
1236                        }
1237                    }
1238                }
1239            }
1240
1241            i += 1;
1242        }
1243
1244        Ok(())
1245    }
1246
1247    fn computeshapelet_information_gain(
1248        &self,
1249        data: &Array2<f64>,
1250        labels: &Array1<usize>,
1251        shapelet: &Shapelet,
1252    ) -> ClusteringResult<f64> {
1253        let n_samples = data.nrows();
1254        let mut distances = Vec::new();
1255
1256        // Compute distances from each time series to the shapelet
1257        for i in 0..n_samples {
1258            let series = data.row(i).to_owned();
1259            let min_distance = self.compute_min_distance_toshapelet(&series, &shapelet.data)?;
1260            distances.push((min_distance, labels[i]));
1261        }
1262
1263        // Sort by distance
1264        distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
1265
1266        // Compute initial entropy
1267        let initial_entropy = self.compute_entropy(labels)?;
1268        let mut best_info_gain: f64 = 0.0;
1269
1270        // Try different split points
1271        for split_idx in 1..distances.len() {
1272            let _threshold = distances[split_idx].0;
1273
1274            // Split data based on threshold
1275            let left_labels: Vec<usize> = distances[..split_idx]
1276                .iter()
1277                .map(|(_, label)| *label)
1278                .collect();
1279            let right_labels: Vec<usize> = distances[split_idx..]
1280                .iter()
1281                .map(|(_, label)| *label)
1282                .collect();
1283
1284            if left_labels.is_empty() || right_labels.is_empty() {
1285                continue;
1286            }
1287
1288            // Compute weighted entropy after split
1289            let left_entropy = self.compute_entropy(&Array1::from_vec(left_labels.clone()))?;
1290            let right_entropy = self.compute_entropy(&Array1::from_vec(right_labels.clone()))?;
1291
1292            let left_weight = left_labels.len() as f64 / n_samples as f64;
1293            let right_weight = right_labels.len() as f64 / n_samples as f64;
1294
1295            let weighted_entropy = left_weight * left_entropy + right_weight * right_entropy;
1296            let info_gain = initial_entropy - weighted_entropy;
1297
1298            best_info_gain = best_info_gain.max(info_gain);
1299        }
1300
1301        Ok(best_info_gain)
1302    }
1303
1304    fn compute_min_distance_toshapelet(
1305        &self,
1306        series: &Array1<f64>,
1307        shapelet: &Array1<f64>,
1308    ) -> ClusteringResult<f64> {
1309        let series_len = series.len();
1310        let shapelet_len = shapelet.len();
1311
1312        if shapelet_len > series_len {
1313            return Ok(f64::INFINITY);
1314        }
1315
1316        let mut min_distance = f64::INFINITY;
1317
1318        for start_pos in 0..=(series_len - shapelet_len) {
1319            let subsequence = series
1320                .slice(s![start_pos..start_pos + shapelet_len])
1321                .to_owned();
1322            let distance = self.compute_series_distance(
1323                &subsequence,
1324                shapelet,
1325                TimeSeriesDistance::Euclidean,
1326            )?;
1327            min_distance = min_distance.min(distance);
1328        }
1329
1330        Ok(min_distance)
1331    }
1332
1333    fn compute_entropy(&self, labels: &Array1<usize>) -> ClusteringResult<f64> {
1334        if labels.is_empty() {
1335            return Ok(0.0);
1336        }
1337
1338        let mut class_counts = HashMap::new();
1339        for &label in labels {
1340            *class_counts.entry(label).or_insert(0) += 1;
1341        }
1342
1343        let total = labels.len() as f64;
1344        let mut entropy = 0.0;
1345
1346        for count in class_counts.values() {
1347            let p = *count as f64 / total;
1348            if p > 0.0 {
1349                entropy -= p * p.log2();
1350            }
1351        }
1352
1353        Ok(entropy)
1354    }
1355}
1356
1357impl Default for TimeSeriesClusterer {
1358    fn default() -> Self {
1359        Self::new()
1360    }
1361}
1362
1363#[cfg(test)]
1364mod tests {
1365    use super::*;
1366    use scirs2_core::ndarray::{Array1, Array2};
1367
1368    #[test]
1369    fn test_kmeans_clustering() {
1370        // Create simple test data
1371        let mut data = Array2::zeros((6, 10));
1372
1373        // Cluster 1: sine waves
1374        for i in 0..3 {
1375            for j in 0..10 {
1376                data[[i, j]] = (j as f64 * 0.1).sin();
1377            }
1378        }
1379
1380        // Cluster 2: cosine waves
1381        for i in 3..6 {
1382            for j in 0..10 {
1383                data[[i, j]] = (j as f64 * 0.1).cos();
1384            }
1385        }
1386
1387        let clusterer = TimeSeriesClusterer::new();
1388        let config = KMeansConfig {
1389            n_clusters: 2,
1390            ..Default::default()
1391        };
1392
1393        let result = clusterer.kmeans_clustering(&data, &config).unwrap();
1394
1395        assert_eq!(result.n_clusters, 2);
1396        assert_eq!(result.cluster_labels.len(), 6);
1397        assert_eq!(result.centroids.len(), 2);
1398        assert!(result.inertia >= 0.0);
1399        assert!(result.silhouette_score >= -1.0 && result.silhouette_score <= 1.0);
1400    }
1401
1402    #[test]
1403    fn test_knn_classification() {
1404        // Create simple test data
1405        let mut train_data = Array2::zeros((4, 10));
1406        let mut test_data = Array2::zeros((2, 10));
1407
1408        // Class 0: sine waves
1409        for i in 0..2 {
1410            for j in 0..10 {
1411                train_data[[i, j]] = (j as f64 * 0.1).sin();
1412            }
1413        }
1414
1415        // Class 1: cosine waves
1416        for i in 2..4 {
1417            for j in 0..10 {
1418                train_data[[i, j]] = (j as f64 * 0.1).cos();
1419            }
1420        }
1421
1422        // Test data: one sine, one cosine
1423        for j in 0..10 {
1424            test_data[[0, j]] = (j as f64 * 0.1).sin();
1425            test_data[[1, j]] = (j as f64 * 0.1).cos();
1426        }
1427
1428        let train_labels = Array1::from_vec(vec![0, 0, 1, 1]);
1429
1430        let clusterer = TimeSeriesClusterer::new();
1431        let config = KNNConfig {
1432            k: 3,
1433            distance: TimeSeriesDistance::Euclidean,
1434            ..Default::default()
1435        };
1436
1437        let result = clusterer
1438            .knn_classification(&train_data, &train_labels, &test_data, &config)
1439            .unwrap();
1440
1441        assert_eq!(result.predicted_labels.len(), 2);
1442        assert_eq!(result.n_classes, 2);
1443        assert!(result.probabilities.is_some());
1444        assert_eq!(result.confidence_scores.len(), 2);
1445
1446        // Should classify correctly
1447        assert_eq!(result.predicted_labels[0], 0); // Sine should be class 0
1448        assert_eq!(result.predicted_labels[1], 1); // Cosine should be class 1
1449    }
1450
1451    #[test]
1452    fn test_discovershapelets() {
1453        // Create simple test data with discriminative patterns
1454        let mut data = Array2::zeros((4, 20));
1455        let labels = Array1::from_vec(vec![0, 0, 1, 1]);
1456
1457        // Class 0: has a peak in the middle
1458        for i in 0..2 {
1459            for j in 0..20 {
1460                if (8..=12).contains(&j) {
1461                    data[[i, j]] = 1.0; // Peak
1462                } else {
1463                    data[[i, j]] = 0.0;
1464                }
1465            }
1466        }
1467
1468        // Class 1: no peak
1469        for i in 2..4 {
1470            for j in 0..20 {
1471                data[[i, j]] = 0.0;
1472            }
1473        }
1474
1475        let clusterer = TimeSeriesClusterer::new();
1476        let config = ShapeletConfig {
1477            min_length: 3,
1478            max_length: 8,
1479            nshapelets: 5,
1480            ..Default::default()
1481        };
1482
1483        let result = clusterer
1484            .discovershapelets(&data, &labels, &config)
1485            .unwrap();
1486
1487        assert!(result.shapelets.len() <= config.nshapelets);
1488        assert_eq!(result.information_gains.len(), result.shapelets.len());
1489        assert_eq!(result.n_series, 4);
1490
1491        // Should find some shapelets with positive information gain
1492        assert!(result.information_gains.iter().any(|&gain| gain > 0.0));
1493    }
1494}
scirs2_series/clustering.rs

scirs2_series/
clustering.rs