Skip to main content

scirs2_series/features/
temporal.rs

1//! Temporal pattern features for time series analysis
2//!
3//! This module provides comprehensive temporal pattern detection including
4//! motif discovery, discord detection, SAX representation, and shapelet extraction
5//! for discriminative pattern analysis.
6
7use scirs2_core::ndarray::{s, Array1, Array2};
8use scirs2_core::numeric::{Float, FromPrimitive};
9use std::fmt::Debug;
10
11use super::utils::{calculate_entropy, euclidean_distance_subsequence, gaussian_breakpoints};
12use crate::error::{Result, TimeSeriesError};
13
14/// Temporal pattern features for time series analysis
15#[derive(Debug, Clone)]
16pub struct TemporalPatternFeatures<F> {
17    /// Motifs (frequently occurring patterns)
18    pub motifs: Vec<MotifInfo<F>>,
19    /// Discord (unusual patterns)
20    pub discord_scores: Array1<F>,
21    /// SAX representation
22    pub sax_symbols: Vec<char>,
23    /// Shapelets (discriminative subsequences)
24    pub shapelets: Vec<ShapeletInfo<F>>,
25}
26
27impl<F> Default for TemporalPatternFeatures<F>
28where
29    F: Float + FromPrimitive,
30{
31    fn default() -> Self {
32        Self {
33            motifs: Vec::new(),
34            discord_scores: Array1::zeros(0),
35            sax_symbols: Vec::new(),
36            shapelets: Vec::new(),
37        }
38    }
39}
40
41/// Information about discovered motifs
42#[derive(Debug, Clone)]
43pub struct MotifInfo<F> {
44    /// Pattern length
45    pub length: usize,
46    /// Locations where motif occurs
47    pub positions: Vec<usize>,
48    /// Frequency of occurrence
49    pub frequency: usize,
50    /// Average distance between instances
51    pub avg_distance: F,
52}
53
54impl<F> Default for MotifInfo<F>
55where
56    F: Float + FromPrimitive,
57{
58    fn default() -> Self {
59        Self {
60            length: 0,
61            positions: Vec::new(),
62            frequency: 0,
63            avg_distance: F::zero(),
64        }
65    }
66}
67
68/// Information about shapelets
69#[derive(Debug, Clone)]
70pub struct ShapeletInfo<F> {
71    /// Shapelet subsequence
72    pub pattern: Array1<F>,
73    /// Starting position in original series
74    pub position: usize,
75    /// Length of shapelet
76    pub length: usize,
77    /// Information gain or discriminative power
78    pub information_gain: F,
79}
80
81impl<F> Default for ShapeletInfo<F>
82where
83    F: Float + FromPrimitive,
84{
85    fn default() -> Self {
86        Self {
87            pattern: Array1::zeros(0),
88            position: 0,
89            length: 0,
90            information_gain: F::zero(),
91        }
92    }
93}
94
95// =============================================================================
96// Main Calculation Functions
97// =============================================================================
98
99/// Calculate temporal pattern features
100#[allow(dead_code)]
101pub fn calculate_temporal_pattern_features<F>(
102    ts: &Array1<F>,
103    motif_length: Option<usize>,
104    max_motifs: usize,
105    k_neighbors: usize,
106    sax_word_length: usize,
107    sax_alphabet_size: usize,
108    detect_patterns: bool,
109) -> Result<TemporalPatternFeatures<F>>
110where
111    F: Float + FromPrimitive + Debug + Clone + scirs2_core::ndarray::ScalarOperand,
112{
113    if !detect_patterns {
114        return Ok(TemporalPatternFeatures::default());
115    }
116
117    let actual_motif_length = motif_length.unwrap_or(ts.len() / 10).max(3);
118
119    // Discover _motifs
120    let motifs = discover_motifs(ts, actual_motif_length, max_motifs)?;
121
122    // Calculate discord scores
123    let discord_scores = calculate_discord_scores(ts, actual_motif_length, k_neighbors)?;
124
125    // Convert to SAX representation
126    let sax_symbols = time_series_to_sax(ts, sax_word_length, sax_alphabet_size)?;
127
128    // For shapelets, we would need labeled data from multiple classes
129    // For now, return empty shapelets
130    let shapelets = Vec::new();
131
132    Ok(TemporalPatternFeatures {
133        motifs,
134        discord_scores,
135        sax_symbols,
136        shapelets,
137    })
138}
139
140// =============================================================================
141// Motif Discovery
142// =============================================================================
143
144/// Discover motifs (frequently occurring patterns) in time series
145///
146/// This function uses a brute-force approach to find the most frequently
147/// occurring subsequences of a given length in the time series.
148///
149/// # Arguments
150///
151/// * `ts` - The time series data
152/// * `motif_length` - Length of motifs to discover
153/// * `max_motifs` - Maximum number of motifs to return
154///
155/// # Returns
156///
157/// * Vector of discovered motifs with their locations and frequencies
158#[allow(dead_code)]
159pub fn discover_motifs<F>(
160    ts: &Array1<F>,
161    motif_length: usize,
162    max_motifs: usize,
163) -> Result<Vec<MotifInfo<F>>>
164where
165    F: Float + FromPrimitive + Debug + Clone,
166{
167    let n = ts.len();
168    if n < motif_length * 2 {
169        return Err(TimeSeriesError::FeatureExtractionError(
170            "Time series too short for motif discovery".to_string(),
171        ));
172    }
173
174    let num_subsequences = n - motif_length + 1;
175    let mut distances = Array2::zeros((num_subsequences, num_subsequences));
176
177    // Calculate distance matrix between all subsequences
178    for i in 0..num_subsequences {
179        for j in (i + 1)..num_subsequences {
180            let dist = euclidean_distance_subsequence(ts, i, j, motif_length);
181            distances[[i, j]] = dist;
182            distances[[j, i]] = dist;
183        }
184    }
185
186    let mut motifs = Vec::new();
187    let mut used_indices = vec![false; num_subsequences];
188
189    for _ in 0..max_motifs {
190        let mut min_dist = F::infinity();
191        let mut best_pair = (0, 0);
192
193        // Find the closest pair of unused subsequences
194        for i in 0..num_subsequences {
195            if used_indices[i] {
196                continue;
197            }
198            for j in (i + motif_length)..num_subsequences {
199                if used_indices[j] {
200                    continue;
201                }
202                if distances[[i, j]] < min_dist {
203                    min_dist = distances[[i, j]];
204                    best_pair = (i, j);
205                }
206            }
207        }
208
209        if min_dist.is_infinite() {
210            break;
211        }
212
213        // Find all subsequences similar to this motif pair
214        let threshold = min_dist * F::from(1.5).expect("Failed to convert constant to float");
215        let mut positions = vec![best_pair.0, best_pair.1];
216
217        for k in 0..num_subsequences {
218            if used_indices[k] || k == best_pair.0 || k == best_pair.1 {
219                continue;
220            }
221
222            let dist_to_first = distances[[best_pair.0, k]];
223            let dist_to_second = distances[[best_pair.1, k]];
224
225            if dist_to_first <= threshold || dist_to_second <= threshold {
226                positions.push(k);
227            }
228        }
229
230        // Mark these positions as used
231        for &pos in &positions {
232            for offset in 0..motif_length {
233                if pos + offset < used_indices.len() {
234                    used_indices[pos + offset] = true;
235                }
236            }
237        }
238
239        // Calculate average distance
240        let mut total_dist = F::zero();
241        let mut count = 0;
242        for i in 0..positions.len() {
243            for j in (i + 1)..positions.len() {
244                total_dist = total_dist + distances[[positions[i], positions[j]]];
245                count += 1;
246            }
247        }
248
249        let avg_distance = if count > 0 {
250            total_dist / F::from(count).expect("Failed to convert to float")
251        } else {
252            F::zero()
253        };
254
255        motifs.push(MotifInfo {
256            length: motif_length,
257            frequency: positions.len(),
258            positions,
259            avg_distance,
260        });
261    }
262
263    Ok(motifs)
264}
265
266// =============================================================================
267// Discord Detection
268// =============================================================================
269
270/// Calculate discord scores for anomalous subsequences
271///
272/// Discord scores identify unusual or anomalous patterns in the time series
273/// by measuring how far each subsequence is from its nearest neighbors.
274///
275/// # Arguments
276///
277/// * `ts` - The time series data
278/// * `discord_length` - Length of discord subsequences
279/// * `k_neighbors` - Number of nearest neighbors to consider
280///
281/// # Returns
282///
283/// * Array of discord scores for each position
284#[allow(dead_code)]
285pub fn calculate_discord_scores<F>(
286    ts: &Array1<F>,
287    discord_length: usize,
288    k_neighbors: usize,
289) -> Result<Array1<F>>
290where
291    F: Float + FromPrimitive + Debug + Clone,
292{
293    let n = ts.len();
294    if n < discord_length * 2 {
295        return Err(TimeSeriesError::FeatureExtractionError(
296            "Time series too short for discord detection".to_string(),
297        ));
298    }
299
300    let num_subsequences = n - discord_length + 1;
301    let mut discord_scores = Array1::zeros(num_subsequences);
302
303    for i in 0..num_subsequences {
304        let mut distances = Vec::new();
305
306        // Calculate distances to all other subsequences
307        for j in 0..num_subsequences {
308            if (i as i32 - j as i32).abs() < discord_length as i32 {
309                continue; // Skip overlapping subsequences
310            }
311
312            let dist = euclidean_distance_subsequence(ts, i, j, discord_length);
313            distances.push(dist);
314        }
315
316        // Sort distances and take k nearest _neighbors
317        distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
318
319        if distances.len() >= k_neighbors {
320            // Discord score is the distance to the k-th nearest neighbor
321            discord_scores[i] = distances[k_neighbors - 1];
322        } else if !distances.is_empty() {
323            discord_scores[i] = distances[distances.len() - 1];
324        }
325    }
326
327    Ok(discord_scores)
328}
329
330// =============================================================================
331// SAX (Symbolic Aggregate approXimation)
332// =============================================================================
333
334/// Convert time series to SAX (Symbolic Aggregate approXimation) representation
335///
336/// SAX converts time series data into a symbolic representation using a finite
337/// alphabet, which enables efficient pattern matching and data mining.
338///
339/// # Arguments
340///
341/// * `ts` - The time series data
342/// * `word_length` - Length of SAX words
343/// * `alphabet_size` - Size of the alphabet (number of symbols)
344///
345/// # Returns
346///
347/// * Vector of SAX symbols
348#[allow(dead_code)]
349pub fn time_series_to_sax<F>(
350    ts: &Array1<F>,
351    word_length: usize,
352    alphabet_size: usize,
353) -> Result<Vec<char>>
354where
355    F: Float + FromPrimitive + Debug + Clone,
356{
357    let n = ts.len();
358    if n < word_length {
359        return Err(TimeSeriesError::FeatureExtractionError(
360            "Time series too short for SAX conversion".to_string(),
361        ));
362    }
363
364    if !(2..=26).contains(&alphabet_size) {
365        return Err(TimeSeriesError::FeatureExtractionError(
366            "Alphabet _size must be between 2 and 26".to_string(),
367        ));
368    }
369
370    // Z-normalize the time series
371    let mean = ts.sum() / F::from(n).expect("Failed to convert to float");
372    let variance = ts.mapv(|x| (x - mean) * (x - mean)).sum()
373        / F::from(n).expect("Failed to convert to float");
374    let std_dev = variance.sqrt();
375
376    let normalized = if std_dev > F::zero() {
377        ts.mapv(|x| (x - mean) / std_dev)
378    } else {
379        Array1::zeros(n)
380    };
381
382    // PAA (Piecewise Aggregate Approximation)
383    let segment_size = n / word_length;
384    let mut paa = Array1::zeros(word_length);
385
386    for i in 0..word_length {
387        let start = i * segment_size;
388        let end = if i == word_length - 1 {
389            n
390        } else {
391            (i + 1) * segment_size
392        };
393
394        let segment_sum = normalized.slice(s![start..end]).sum();
395        let segment_len = end - start;
396        paa[i] = segment_sum / F::from(segment_len).expect("Failed to convert to float");
397    }
398
399    // Convert to symbols using Gaussian breakpoints
400    let breakpoints = gaussian_breakpoints(alphabet_size);
401    let mut sax_symbols = Vec::with_capacity(word_length);
402
403    for &value in paa.iter() {
404        let symbol_index = breakpoints
405            .iter()
406            .position(|&bp| value.to_f64().unwrap_or(0.0) <= bp)
407            .unwrap_or(alphabet_size - 1);
408
409        let symbol = (b'a' + symbol_index as u8) as char;
410        sax_symbols.push(symbol);
411    }
412
413    Ok(sax_symbols)
414}
415
416// =============================================================================
417// Shapelet Extraction
418// =============================================================================
419
420/// Extract shapelets (discriminative subsequences) from time series
421///
422/// Shapelets are time series subsequences that are maximally representative
423/// of a class. This function requires labeled data from multiple classes.
424///
425/// # Arguments
426///
427/// * `ts_class1` - Time series from class 1
428/// * `ts_class2` - Time series from class 2  
429/// * `min_length` - Minimum shapelet length
430/// * `max_length` - Maximum shapelet length
431/// * `maxshapelets` - Maximum number of shapelets to return
432///
433/// # Returns
434///
435/// * Vector of discovered shapelets
436#[allow(dead_code)]
437pub fn extractshapelets<F>(
438    ts_class1: &[Array1<F>],
439    ts_class2: &[Array1<F>],
440    min_length: usize,
441    max_length: usize,
442    maxshapelets: usize,
443) -> Result<Vec<ShapeletInfo<F>>>
444where
445    F: Float + FromPrimitive + Debug + Clone,
446{
447    if ts_class1.is_empty() || ts_class2.is_empty() {
448        return Err(TimeSeriesError::FeatureExtractionError(
449            "Need at least one time series from each class".to_string(),
450        ));
451    }
452
453    let mut all_candidates = Vec::new();
454
455    // Generate candidate shapelets from class 1
456    for ts in ts_class1.iter() {
457        for length in min_length..=max_length.min(ts.len() / 2) {
458            for start in 0..=(ts.len() - length) {
459                let shapelet = ts.slice(s![start..start + length]).to_owned();
460
461                // Calculate information gain
462                let info_gain =
463                    calculateshapelet_information_gain(&shapelet, ts_class1, ts_class2)?;
464
465                all_candidates.push(ShapeletInfo {
466                    pattern: shapelet,
467                    position: start,
468                    length,
469                    information_gain: info_gain,
470                });
471            }
472        }
473    }
474
475    // Sort by information gain and take the best ones
476    all_candidates.sort_by(|a, b| {
477        b.information_gain
478            .partial_cmp(&a.information_gain)
479            .unwrap_or(std::cmp::Ordering::Equal)
480    });
481
482    all_candidates.truncate(maxshapelets);
483    Ok(all_candidates)
484}
485
486// =============================================================================
487// Helper Functions
488// =============================================================================
489
490/// Calculate information gain for a shapelet candidate
491#[allow(dead_code)]
492fn calculateshapelet_information_gain<F>(
493    shapelet: &Array1<F>,
494    ts_class1: &[Array1<F>],
495    ts_class2: &[Array1<F>],
496) -> Result<F>
497where
498    F: Float + FromPrimitive + Debug + Clone,
499{
500    let total_count = ts_class1.len() + ts_class2.len();
501    let class1_count = ts_class1.len();
502    let class2_count = ts_class2.len();
503
504    if total_count == 0 {
505        return Ok(F::zero());
506    }
507
508    // Calculate original entropy
509    let p1 = class1_count as f64 / total_count as f64;
510    let p2 = class2_count as f64 / total_count as f64;
511    let original_entropy = if p1 > 0.0 && p2 > 0.0 {
512        -(p1 * p1.ln() + p2 * p2.ln())
513    } else {
514        0.0
515    };
516
517    // Find best threshold by calculating distances to shapelet
518    let mut distances_class1 = Vec::new();
519    let mut distances_class2 = Vec::new();
520
521    for ts in ts_class1 {
522        let min_dist = find_min_distance_toshapelet(ts, shapelet);
523        distances_class1.push(min_dist);
524    }
525
526    for ts in ts_class2 {
527        let min_dist = find_min_distance_toshapelet(ts, shapelet);
528        distances_class2.push(min_dist);
529    }
530
531    // Try different thresholds to find the best split
532    let mut all_distances: Vec<F> = distances_class1
533        .iter()
534        .cloned()
535        .chain(distances_class2.iter().cloned())
536        .collect();
537    all_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
538
539    let mut best_info_gain = F::zero();
540
541    for &threshold in &all_distances {
542        // Count instances in each split
543        let left_class1 = distances_class1.iter().filter(|&&d| d <= threshold).count();
544        let left_class2 = distances_class2.iter().filter(|&&d| d <= threshold).count();
545        let right_class1 = class1_count - left_class1;
546        let right_class2 = class2_count - left_class2;
547
548        let left_total = left_class1 + left_class2;
549        let right_total = right_class1 + right_class2;
550
551        if left_total == 0 || right_total == 0 {
552            continue;
553        }
554
555        // Calculate weighted entropy for this split
556        let left_entropy = calculate_entropy(left_class1, left_class2);
557        let right_entropy = calculate_entropy(right_class1, right_class2);
558
559        let weighted_entropy = (left_total as f64 / total_count as f64) * left_entropy
560            + (right_total as f64 / total_count as f64) * right_entropy;
561
562        let info_gain = original_entropy - weighted_entropy;
563
564        if F::from(info_gain).expect("Failed to convert to float") > best_info_gain {
565            best_info_gain = F::from(info_gain).expect("Failed to convert to float");
566        }
567    }
568
569    Ok(best_info_gain)
570}
571
572/// Find minimum distance from a time series to a shapelet
573#[allow(dead_code)]
574fn find_min_distance_toshapelet<F>(ts: &Array1<F>, shapelet: &Array1<F>) -> F
575where
576    F: Float + FromPrimitive,
577{
578    let shapelet_len = shapelet.len();
579    let ts_len = ts.len();
580
581    if ts_len < shapelet_len {
582        return F::infinity();
583    }
584
585    let mut min_distance = F::infinity();
586
587    for start in 0..=(ts_len - shapelet_len) {
588        let mut sum = F::zero();
589        for i in 0..shapelet_len {
590            let diff = ts[start + i] - shapelet[i];
591            sum = sum + diff * diff;
592        }
593        let distance = sum.sqrt();
594
595        if distance < min_distance {
596            min_distance = distance;
597        }
598    }
599
600    min_distance
601}
602
603/// Calculate distance matrix for time series subsequences
604#[allow(dead_code)]
605pub fn calculate_distance_matrix<F>(ts: &Array1<F>, subsequence_length: usize) -> Result<Array2<F>>
606where
607    F: Float + FromPrimitive + Debug + Clone,
608{
609    let n = ts.len();
610    if n < subsequence_length {
611        return Err(TimeSeriesError::FeatureExtractionError(
612            "Time series too short for distance matrix calculation".to_string(),
613        ));
614    }
615
616    let num_subsequences = n - subsequence_length + 1;
617    let mut distances = Array2::zeros((num_subsequences, num_subsequences));
618
619    for i in 0..num_subsequences {
620        for j in (i + 1)..num_subsequences {
621            let dist = euclidean_distance_subsequence(ts, i, j, subsequence_length);
622            distances[[i, j]] = dist;
623            distances[[j, i]] = dist;
624        }
625    }
626
627    Ok(distances)
628}
629
630/// Find nearest neighbors for each subsequence
631#[allow(dead_code)]
632pub fn find_nearest_neighbors<F>(distance_matrix: &Array2<F>, k: usize) -> Result<Vec<Vec<usize>>>
633where
634    F: Float + FromPrimitive + Debug + Clone,
635{
636    let n = distance_matrix.nrows();
637    let mut neighbors = Vec::with_capacity(n);
638
639    for i in 0..n {
640        let mut distances_with_indices: Vec<(F, usize)> = (0..n)
641            .filter(|&j| j != i)
642            .map(|j| (distance_matrix[[i, j]], j))
643            .collect();
644
645        distances_with_indices
646            .sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
647
648        let nearest_k: Vec<usize> = distances_with_indices
649            .into_iter()
650            .take(k)
651            .map(|(_, idx)| idx)
652            .collect();
653
654        neighbors.push(nearest_k);
655    }
656
657    Ok(neighbors)
658}
659
660/// Calculate local intrinsic dimensionality for each subsequence
661#[allow(dead_code)]
662pub fn calculate_local_intrinsic_dimensionality<F>(
663    distance_matrix: &Array2<F>,
664    k: usize,
665) -> Result<Vec<F>>
666where
667    F: Float + FromPrimitive + Debug + Clone,
668{
669    let n = distance_matrix.nrows();
670    let mut lid_values = Vec::with_capacity(n);
671
672    for i in 0..n {
673        let mut distances: Vec<F> = (0..n)
674            .filter(|&j| j != i)
675            .map(|j| distance_matrix[[i, j]])
676            .collect();
677
678        distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
679
680        if distances.len() < k || distances[k - 1] == F::zero() {
681            lid_values.push(F::zero());
682            continue;
683        }
684
685        // Calculate LID using the maximum likelihood estimator
686        let mut sum = F::zero();
687        for j in 0..k {
688            if distances[j] > F::zero() && distances[k - 1] > F::zero() {
689                sum = sum + (distances[k - 1] / distances[j]).ln();
690            }
691        }
692
693        let lid = F::from(k).expect("Failed to convert to float") / sum;
694        lid_values.push(lid);
695    }
696
697    Ok(lid_values)
698}