//! oxits 0.1.0
//!
//! Time series classification and transformation library for Rust.
//! Documentation for each item follows inline below.
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};

/// ShapeletTransform: Extracts discriminative shapelets from time series
/// and transforms data by computing distances to each selected shapelet.
///
/// Pipeline:
/// 1. Extract candidate shapelets (random subsequences from training data)
/// 2. Score each candidate by information gain
/// 3. Select top-k shapelets
/// 4. Transform: compute minimum distance from each sample to each shapelet

/// Configuration for the shapelet-transform pipeline.
#[derive(Debug, Clone)]
pub struct ShapeletTransformConfig {
    /// Number of shapelets kept after scoring.
    pub n_shapelets: usize,
    /// Number of random candidate subsequences drawn and scored.
    pub n_candidates: usize,
    /// Shortest candidate length considered.
    pub min_shapelet_length: usize,
    /// Longest candidate length considered; 0 means auto (n_timestamps / 2).
    pub max_shapelet_length: usize,
    /// Seed for reproducible candidate sampling; `None` uses OS entropy.
    pub random_seed: Option<u64>,
}

impl ShapeletTransformConfig {
    /// Build a configuration that keeps `n_shapelets` shapelets and uses
    /// the default sampling parameters for everything else.
    pub fn new(n_shapelets: usize) -> Self {
        ShapeletTransformConfig {
            random_seed: None,
            // Sentinel: resolved to n_timestamps / 2 at fit time.
            max_shapelet_length: 0,
            min_shapelet_length: 3,
            n_candidates: 200,
            n_shapelets,
        }
    }
}

/// A single extracted shapelet: a subsequence taken from one training
/// series (z-normalized at extraction time in `fit`).
#[derive(Debug, Clone)]
pub struct Shapelet {
    /// The shapelet's values (z-normalized on extraction).
    pub data: Vec<f64>,
}

/// Result of fitting: the shapelets selected by `ShapeletTransform::fit`,
/// ordered by descending information gain.
#[derive(Debug, Clone)]
pub struct ShapeletTransformFitted {
    /// Selected shapelets (at most `config.n_shapelets` of them).
    pub shapelets: Vec<Shapelet>,
}

pub struct ShapeletTransform;

impl ShapeletTransform {
    /// Fit: extract and select discriminative shapelets.
    ///
    /// # Panics
    /// Panics if `x` is empty, if `x` and `y` differ in length, if the
    /// samples are not all the same length, or if the effective minimum
    /// shapelet length resolves to zero.
    pub fn fit(
        config: &ShapeletTransformConfig,
        x: &[Vec<f64>],
        y: &[String],
    ) -> ShapeletTransformFitted {
        assert!(!x.is_empty(), "Input must have at least one sample");
        assert_eq!(x.len(), y.len(), "X and y must have same length");

        let n_timestamps = x[0].len();
        // Ragged input would otherwise panic deep inside the subsequence
        // slicing below with an unhelpful message.
        assert!(
            x.iter().all(|s| s.len() == n_timestamps),
            "All samples must have the same length"
        );

        // Resolve the shapelet length range. A configured max of 0 means
        // "auto" (half the series length). Either way the max must not
        // exceed the series length, or `n_timestamps - length` below would
        // underflow and panic.
        let max_len = if config.max_shapelet_length == 0 {
            n_timestamps / 2
        } else {
            config.max_shapelet_length.min(n_timestamps)
        };
        let min_len = config.min_shapelet_length.min(max_len);

        assert!(min_len >= 1, "min_shapelet_length must be >= 1");

        let mut rng = match config.random_seed {
            Some(seed) => StdRng::seed_from_u64(seed),
            None => StdRng::from_entropy(),
        };

        // Step 1: Extract random candidate shapelets (z-normalized).
        let mut candidates: Vec<Shapelet> = Vec::with_capacity(config.n_candidates);
        for _ in 0..config.n_candidates {
            let sample_idx = rng.gen_range(0..x.len());
            let length = rng.gen_range(min_len..=max_len);
            let start = rng.gen_range(0..=n_timestamps - length);
            let data = Self::z_normalize(&x[sample_idx][start..start + length]);
            candidates.push(Shapelet { data });
        }

        // Step 2: Score each candidate by the best information gain any
        // distance threshold achieves on the training labels.
        let mut scored: Vec<(f64, usize)> = candidates
            .iter()
            .enumerate()
            .map(|(idx, shapelet)| {
                let distances: Vec<f64> = x
                    .iter()
                    .map(|sample| min_distance(sample, &shapelet.data))
                    .collect();
                (information_gain(&distances, y), idx)
            })
            .collect();

        // Step 3: Select top-k by information gain, descending. `total_cmp`
        // gives a total order, so a NaN score cannot panic the sort.
        scored.sort_by(|a, b| b.0.total_cmp(&a.0));
        let n_select = config.n_shapelets.min(scored.len());
        let shapelets: Vec<Shapelet> = scored[..n_select]
            .iter()
            .map(|&(_, idx)| candidates[idx].clone())
            .collect();

        ShapeletTransformFitted { shapelets }
    }

    /// Z-normalize a subsequence: zero mean, unit variance. A (near-)constant
    /// subsequence maps to all zeros to avoid dividing by ~0.
    fn z_normalize(data: &[f64]) -> Vec<f64> {
        let n = data.len() as f64;
        let mean = data.iter().sum::<f64>() / n;
        let var = data.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / n;
        let std = var.sqrt();
        if std > 1e-10 {
            data.iter().map(|v| (v - mean) / std).collect()
        } else {
            vec![0.0; data.len()]
        }
    }

    /// Transform: compute the minimum distance from each sample to each
    /// fitted shapelet.
    ///
    /// Output shape: `(n_samples, n_shapelets)`.
    ///
    /// # Panics
    /// Panics if `x` is empty.
    pub fn transform(fitted: &ShapeletTransformFitted, x: &[Vec<f64>]) -> Vec<Vec<f64>> {
        assert!(!x.is_empty(), "Input must have at least one sample");

        x.iter()
            .map(|sample| {
                fitted
                    .shapelets
                    .iter()
                    .map(|shapelet| min_distance(sample, &shapelet.data))
                    .collect()
            })
            .collect()
    }

    /// Fit on `(x, y)` and immediately transform the same `x`.
    pub fn fit_transform(
        config: &ShapeletTransformConfig,
        x: &[Vec<f64>],
        y: &[String],
    ) -> Vec<Vec<f64>> {
        let fitted = Self::fit(config, x, y);
        Self::transform(&fitted, x)
    }
}

/// Compute the minimum (squared Euclidean) distance between a time series
/// and a shapelet (subsequence matching with z-normalization).
/// Minimum squared Euclidean distance between `shapelet` and every
/// z-normalized sliding window of `ts` (subsequence matching).
///
/// Returns `f64::INFINITY` when the shapelet is longer than the series,
/// and 0.0 for an empty shapelet.
fn min_distance(ts: &[f64], shapelet: &[f64]) -> f64 {
    let m = shapelet.len();
    if m > ts.len() {
        return f64::INFINITY;
    }
    // `windows(0)` would panic; the original's arithmetic yields 0.0 here.
    if m == 0 {
        return 0.0;
    }

    let mut best = f64::INFINITY;

    for window in ts.windows(m) {
        // Z-normalize the current window on the fly.
        let mean = window.iter().sum::<f64>() / m as f64;
        let var = window.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / m as f64;
        let std = var.sqrt();

        let dist: f64 = if std > 1e-10 {
            window
                .iter()
                .zip(shapelet)
                .map(|(&v, &s)| ((v - mean) / std - s).powi(2))
                .sum()
        } else {
            // Constant window normalizes to all zeros, so the distance is
            // the shapelet's squared norm.
            shapelet.iter().map(|&s| s.powi(2)).sum()
        };

        if dist < best {
            best = dist;
            // A (near-)zero distance cannot be beaten; stop scanning.
            if best < 1e-15 {
                return 0.0;
            }
        }
    }

    best
}

/// Compute information gain for a split based on distances and labels.
/// Tries all possible threshold values and returns the best IG.
/// Best information gain achievable by thresholding `distances` to split
/// the labels `y` into two groups.
///
/// Every split point between consecutive distinct sorted distances is tried;
/// the maximum gain over them is returned (0.0 if no split helps or the
/// input is empty).
fn information_gain(distances: &[f64], y: &[String]) -> f64 {
    use std::collections::HashMap;

    // Shannon entropy (base 2) from a class-count map over `total` items.
    fn entropy(counts: &HashMap<&str, usize>, total: f64) -> f64 {
        counts
            .values()
            .filter(|&&c| c > 0)
            .map(|&c| {
                let p = c as f64 / total;
                -(p * p.log2())
            })
            .sum()
    }

    // Pair each distance with its label and sort by distance. `total_cmp`
    // gives a total order, so a NaN distance cannot panic the sort.
    let mut indexed: Vec<(f64, &str)> = distances
        .iter()
        .zip(y)
        .map(|(&d, l)| (d, l.as_str()))
        .collect();
    indexed.sort_by(|a, b| a.0.total_cmp(&b.0));

    let n = indexed.len();
    if n == 0 {
        return 0.0;
    }

    // Maintain running class counts on each side of the split so each
    // candidate threshold costs O(classes), instead of rebuilding label
    // vectors and recounting per split (previously O(n^2)).
    let mut right: HashMap<&str, usize> = HashMap::new();
    for &(_, label) in &indexed {
        *right.entry(label).or_insert(0) += 1;
    }
    let parent_entropy = entropy(&right, n as f64);

    let mut left: HashMap<&str, usize> = HashMap::new();
    let mut best_ig = 0.0;

    for i in 0..n - 1 {
        // Move element i from the right partition to the left one.
        let label = indexed[i].1;
        *left.entry(label).or_insert(0) += 1;
        *right.get_mut(label).expect("label present in right counts") -= 1;

        // Only split between distinct distance values.
        if (indexed[i].0 - indexed[i + 1].0).abs() < 1e-15 {
            continue;
        }

        let n_left = (i + 1) as f64;
        let n_right = (n - i - 1) as f64;
        let ig = parent_entropy
            - (n_left / n as f64) * entropy(&left, n_left)
            - (n_right / n as f64) * entropy(&right, n_right);

        if ig > best_ig {
            best_ig = ig;
        }
    }

    best_ig
}

/// Compute Shannon entropy from a list of labels.
/// Shannon entropy (base 2) of the class distribution in `labels`.
/// Returns 0.0 for an empty slice.
fn entropy_from_labels(labels: &[String]) -> f64 {
    if labels.is_empty() {
        return 0.0;
    }
    let total = labels.len() as f64;
    let mut counts = std::collections::HashMap::<&str, usize>::new();
    for label in labels {
        *counts.entry(label.as_str()).or_insert(0) += 1;
    }
    counts
        .values()
        .map(|&c| {
            let p = c as f64 / total;
            -(p * p.log2())
        })
        .sum()
}

/// Compute Shannon entropy from a list of string references.
/// Shannon entropy (base 2) of the class distribution given as string
/// slices. Returns 0.0 for an empty slice.
fn entropy_from_strs(labels: &[&str]) -> f64 {
    use std::collections::HashMap;

    if labels.is_empty() {
        return 0.0;
    }
    let total = labels.len() as f64;

    // Count occurrences of each distinct label.
    let mut freq: HashMap<&str, usize> = HashMap::new();
    for &label in labels {
        *freq.entry(label).or_default() += 1;
    }

    // Sum -p * log2(p) over the class probabilities (every count is >= 1,
    // so p is always positive).
    freq.values()
        .map(|&count| count as f64 / total)
        .map(|p| -(p * p.log2()))
        .sum()
}

/// Unit tests for the shapelet pipeline and its scoring helpers. Seeded
/// RNG configs are used so fit results are reproducible across runs.
#[cfg(test)]
mod tests {
    use super::*;

    // End-to-end: fit_transform on a small labelled set yields a
    // (n_samples x n_shapelets) matrix of non-negative distances.
    #[test]
    fn test_shapelet_transform_basic() {
        let config = ShapeletTransformConfig {
            n_shapelets: 3,
            n_candidates: 20,
            random_seed: Some(42),
            ..ShapeletTransformConfig::new(3)
        };
        let x = vec![
            vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
            vec![7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0],
            vec![0.0, 2.0, 4.0, 6.0, 4.0, 2.0, 0.0, -2.0],
            vec![1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0],
        ];
        let y = vec![
            "A".to_string(),
            "B".to_string(),
            "A".to_string(),
            "B".to_string(),
        ];
        let result = ShapeletTransform::fit_transform(&config, &x, &y);
        assert_eq!(result.len(), 4);
        assert_eq!(result[0].len(), 3);
        // All distances should be non-negative
        for row in &result {
            for &v in row {
                assert!(v >= 0.0, "Distance should be non-negative, got {v}");
            }
        }
    }

    // Same seed + same data must produce identical transforms.
    #[test]
    fn test_shapelet_transform_deterministic() {
        let config = ShapeletTransformConfig {
            random_seed: Some(123),
            n_candidates: 10,
            ..ShapeletTransformConfig::new(2)
        };
        let x = vec![
            vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            vec![5.0, 4.0, 3.0, 2.0, 1.0, 0.0],
        ];
        let y = vec!["A".to_string(), "B".to_string()];
        let r1 = ShapeletTransform::fit_transform(&config, &x, &y);
        let r2 = ShapeletTransform::fit_transform(&config, &x, &y);
        assert_eq!(r1, r2);
    }

    #[test]
    fn test_min_distance_exact_match() {
        // If shapelet matches a z-normalized subsequence, distance should be ~0
        let ts = vec![10.0, 20.0, 30.0, 40.0, 50.0];
        // shapelet is z-normalized version of ts[1..4] = [20, 30, 40]
        // mean=30, std=sqrt(200/3)
        let mean = 30.0;
        let std = (200.0_f64 / 3.0).sqrt();
        let shapelet = vec![
            (20.0 - mean) / std,
            (30.0 - mean) / std,
            (40.0 - mean) / std,
        ];
        let dist = min_distance(&ts, &shapelet);
        assert!(dist < 1e-8, "Expected ~0 distance, got {dist}");
    }

    #[test]
    fn test_information_gain() {
        // Perfect split: all A's have small distance, all B's have large
        let distances = vec![0.1, 0.2, 10.0, 11.0];
        let labels = vec![
            "A".to_string(),
            "A".to_string(),
            "B".to_string(),
            "B".to_string(),
        ];
        let ig = information_gain(&distances, &labels);
        assert!((ig - 1.0).abs() < 1e-10, "Expected IG=1.0, got {ig}");
    }

    #[test]
    fn test_entropy() {
        // Uniform distribution: entropy = log2(2) = 1
        let labels = vec!["A".to_string(), "B".to_string()];
        let e = entropy_from_labels(&labels);
        assert!((e - 1.0).abs() < 1e-10);

        // Pure: entropy = 0
        let labels = vec!["A".to_string(), "A".to_string()];
        let e = entropy_from_labels(&labels);
        assert!(e.abs() < 1e-10);
    }
}