use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
#[derive(Debug, Clone)]
pub struct ShapeletTransformConfig {
pub n_shapelets: usize,
pub n_candidates: usize,
pub min_shapelet_length: usize,
pub max_shapelet_length: usize,
pub random_seed: Option<u64>,
}
impl ShapeletTransformConfig {
pub fn new(n_shapelets: usize) -> Self {
Self {
n_shapelets,
n_candidates: 200,
min_shapelet_length: 3,
max_shapelet_length: 0, random_seed: None,
}
}
}
#[derive(Debug, Clone)]
pub struct Shapelet {
pub data: Vec<f64>,
}
#[derive(Debug, Clone)]
pub struct ShapeletTransformFitted {
pub shapelets: Vec<Shapelet>,
}
pub struct ShapeletTransform;
impl ShapeletTransform {
pub fn fit(
config: &ShapeletTransformConfig,
x: &[Vec<f64>],
y: &[String],
) -> ShapeletTransformFitted {
assert!(!x.is_empty(), "Input must have at least one sample");
assert_eq!(x.len(), y.len(), "X and y must have same length");
let n_timestamps = x[0].len();
let max_len = if config.max_shapelet_length == 0 {
n_timestamps / 2
} else {
config.max_shapelet_length
};
let min_len = config.min_shapelet_length.min(max_len);
assert!(min_len >= 1, "min_shapelet_length must be >= 1");
let mut rng = match config.random_seed {
Some(seed) => StdRng::seed_from_u64(seed),
None => StdRng::from_entropy(),
};
let mut candidates: Vec<Shapelet> = Vec::with_capacity(config.n_candidates);
for _ in 0..config.n_candidates {
let sample_idx = rng.gen_range(0..x.len());
let length = rng.gen_range(min_len..=max_len);
let start = rng.gen_range(0..=n_timestamps - length);
let data = x[sample_idx][start..start + length].to_vec();
let mean = data.iter().sum::<f64>() / data.len() as f64;
let var = data.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / data.len() as f64;
let std = var.sqrt();
let normalized = if std > 1e-10 {
data.iter().map(|v| (v - mean) / std).collect()
} else {
vec![0.0; data.len()]
};
candidates.push(Shapelet { data: normalized });
}
let mut scored: Vec<(f64, usize)> = candidates
.iter()
.enumerate()
.map(|(idx, shapelet)| {
let distances: Vec<f64> = x
.iter()
.map(|sample| min_distance(sample, &shapelet.data))
.collect();
let ig = information_gain(&distances, y);
(ig, idx)
})
.collect();
scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap());
let n_select = config.n_shapelets.min(scored.len());
let shapelets: Vec<Shapelet> = scored[..n_select]
.iter()
.map(|&(_, idx)| candidates[idx].clone())
.collect();
ShapeletTransformFitted { shapelets }
}
pub fn transform(fitted: &ShapeletTransformFitted, x: &[Vec<f64>]) -> Vec<Vec<f64>> {
assert!(!x.is_empty(), "Input must have at least one sample");
x.iter()
.map(|sample| {
fitted
.shapelets
.iter()
.map(|shapelet| min_distance(sample, &shapelet.data))
.collect()
})
.collect()
}
pub fn fit_transform(
config: &ShapeletTransformConfig,
x: &[Vec<f64>],
y: &[String],
) -> Vec<Vec<f64>> {
let fitted = Self::fit(config, x, y);
Self::transform(&fitted, x)
}
}
fn min_distance(ts: &[f64], shapelet: &[f64]) -> f64 {
let n = ts.len();
let m = shapelet.len();
if m > n {
return f64::INFINITY;
}
let mut min_dist = f64::INFINITY;
for i in 0..=n - m {
let subseq = &ts[i..i + m];
let mean = subseq.iter().sum::<f64>() / m as f64;
let var = subseq.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / m as f64;
let std = var.sqrt();
let dist: f64 = if std > 1e-10 {
subseq
.iter()
.zip(shapelet.iter())
.map(|(&s, &sh)| {
let normalized = (s - mean) / std;
(normalized - sh).powi(2)
})
.sum()
} else {
shapelet.iter().map(|&sh| sh.powi(2)).sum()
};
if dist < min_dist {
min_dist = dist;
if min_dist < 1e-15 {
return 0.0;
}
}
}
min_dist
}
fn information_gain(distances: &[f64], y: &[String]) -> f64 {
let n = distances.len() as f64;
if n == 0.0 {
return 0.0;
}
let parent_entropy = entropy_from_labels(y);
let mut indexed: Vec<(f64, &str)> = distances
.iter()
.zip(y.iter())
.map(|(&d, l)| (d, l.as_str()))
.collect();
indexed.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
let mut best_ig = 0.0;
for i in 0..indexed.len() - 1 {
if (indexed[i].0 - indexed[i + 1].0).abs() < 1e-15 {
continue;
}
let left: Vec<&str> = indexed[..=i].iter().map(|&(_, l)| l).collect();
let right: Vec<&str> = indexed[i + 1..].iter().map(|&(_, l)| l).collect();
let left_weight = left.len() as f64 / n;
let right_weight = right.len() as f64 / n;
let ig = parent_entropy
- left_weight * entropy_from_strs(&left)
- right_weight * entropy_from_strs(&right);
if ig > best_ig {
best_ig = ig;
}
}
best_ig
}
fn entropy_from_labels(labels: &[String]) -> f64 {
let strs: Vec<&str> = labels.iter().map(|s| s.as_str()).collect();
entropy_from_strs(&strs)
}
fn entropy_from_strs(labels: &[&str]) -> f64 {
let n = labels.len() as f64;
if n == 0.0 {
return 0.0;
}
let mut counts: std::collections::HashMap<&str, usize> = std::collections::HashMap::new();
for &l in labels {
*counts.entry(l).or_insert(0) += 1;
}
let mut entropy = 0.0;
for &count in counts.values() {
let p = count as f64 / n;
if p > 0.0 {
entropy -= p * p.log2();
}
}
entropy
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_shapelet_transform_basic() {
let config = ShapeletTransformConfig {
n_shapelets: 3,
n_candidates: 20,
random_seed: Some(42),
..ShapeletTransformConfig::new(3)
};
let x = vec![
vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
vec![7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0],
vec![0.0, 2.0, 4.0, 6.0, 4.0, 2.0, 0.0, -2.0],
vec![1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0],
];
let y = vec![
"A".to_string(),
"B".to_string(),
"A".to_string(),
"B".to_string(),
];
let result = ShapeletTransform::fit_transform(&config, &x, &y);
assert_eq!(result.len(), 4);
assert_eq!(result[0].len(), 3);
for row in &result {
for &v in row {
assert!(v >= 0.0, "Distance should be non-negative, got {v}");
}
}
}
#[test]
fn test_shapelet_transform_deterministic() {
let config = ShapeletTransformConfig {
random_seed: Some(123),
n_candidates: 10,
..ShapeletTransformConfig::new(2)
};
let x = vec![
vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
vec![5.0, 4.0, 3.0, 2.0, 1.0, 0.0],
];
let y = vec!["A".to_string(), "B".to_string()];
let r1 = ShapeletTransform::fit_transform(&config, &x, &y);
let r2 = ShapeletTransform::fit_transform(&config, &x, &y);
assert_eq!(r1, r2);
}
#[test]
fn test_min_distance_exact_match() {
let ts = vec![10.0, 20.0, 30.0, 40.0, 50.0];
let mean = 30.0;
let std = (200.0_f64 / 3.0).sqrt();
let shapelet = vec![
(20.0 - mean) / std,
(30.0 - mean) / std,
(40.0 - mean) / std,
];
let dist = min_distance(&ts, &shapelet);
assert!(dist < 1e-8, "Expected ~0 distance, got {dist}");
}
#[test]
fn test_information_gain() {
let distances = vec![0.1, 0.2, 10.0, 11.0];
let labels = vec![
"A".to_string(),
"A".to_string(),
"B".to_string(),
"B".to_string(),
];
let ig = information_gain(&distances, &labels);
assert!((ig - 1.0).abs() < 1e-10, "Expected IG=1.0, got {ig}");
}
#[test]
fn test_entropy() {
let labels = vec!["A".to_string(), "B".to_string()];
let e = entropy_from_labels(&labels);
assert!((e - 1.0).abs() < 1e-10);
let labels = vec!["A".to_string(), "A".to_string()];
let e = entropy_from_labels(&labels);
assert!(e.abs() < 1e-10);
}
}