use crate::approximation::sax::{sax_transform_symbolic, SaxConfig};
use crate::core::config::{BinStrategy, NumerosityReduction};
use crate::core::traits::Transformer;
use crate::preprocessing::scaler::{StandardScaler, StandardScalerConfig};
/// Parameters for the bag-of-words transformation of time series.
///
/// Each sample is cut into sliding windows, every window is turned into a
/// word, and the words of one sample are joined into a single string.
#[derive(Debug, Clone)]
pub struct BagOfWordsConfig {
/// Number of consecutive values in each sliding window. `BagOfWords::transform`
/// asserts that this does not exceed the length of the first sample.
pub window_size: usize,
/// Forwarded to the SAX step as its `output_size`; the unit tests in this
/// file expect every produced word to have exactly this many characters.
pub word_size: usize,
/// Forwarded to the SAX step as `n_bins`. Bin indices are rendered as letters
/// counting up from `'a'`, so this presumably acts as the alphabet size.
pub n_bins: usize,
/// Binning strategy forwarded unchanged to the SAX step.
pub strategy: BinStrategy,
/// Whether runs of identical consecutive words are collapsed into one
/// (`IdenticalConsecutive`) or all words are kept (`None`).
pub numerosity_reduction: NumerosityReduction,
/// Distance between the start indices of successive windows. It is handed to
/// `Iterator::step_by`, which panics on a step of zero.
pub window_step: usize,
}
impl BagOfWordsConfig {
pub fn new(window_size: usize, word_size: usize) -> Self {
Self {
window_size,
word_size,
n_bins: 4,
strategy: BinStrategy::Normal,
numerosity_reduction: NumerosityReduction::IdenticalConsecutive,
window_step: 1,
}
}
}
/// Bag-of-words transformer for time series.
///
/// The type carries no state; every parameter comes from the
/// [`BagOfWordsConfig`] passed to [`BagOfWords::transform`].
pub struct BagOfWords;

impl BagOfWords {
    /// Converts every sample in `x` into one space-separated string of words.
    ///
    /// Samples are processed independently; the output holds one string per
    /// input sample, in input order. Samples may differ in length as long as
    /// each one contains at least `config.window_size` values.
    ///
    /// # Panics
    ///
    /// Panics if `x` is empty, if `config.window_step` is zero, or if any
    /// sample is shorter than `config.window_size`.
    pub fn transform(config: &BagOfWordsConfig, x: &[Vec<f64>]) -> Vec<String> {
        assert!(!x.is_empty(), "Input must have at least one sample");
        // A zero stride cannot advance the sliding window; reject it here with
        // a message that names the offending field.
        assert!(config.window_step > 0, "window_step must be positive");
        // Check every sample, not just the first, so a short sample anywhere in
        // the batch is rejected here with a clear message.
        assert!(
            x.iter().all(|sample| config.window_size <= sample.len()),
            "window_size must not exceed n_timestamps"
        );
        x.iter().map(|sample| bow_single(sample, config)).collect()
    }
}
/// Turns one time series into a space-separated string of words.
///
/// Steps, in order:
/// 1. Slice `sample` into windows of `config.window_size` values whose start
///    indices are `config.window_step` apart.
/// 2. Pass the windows through `StandardScaler` with a default-constructed
///    config.
/// 3. Discretise the scaled windows with `sax_transform_symbolic`, forwarding
///    `n_bins`, `strategy` and `word_size` (as `output_size`).
/// 4. Render each bin index as the character `'a' + index`. The sum is computed
///    in `u8`; assuming indices lie in `0..n_bins`, more than 26 bins yields
///    symbols past `'z'`.
/// 5. Apply the configured numerosity reduction.
/// 6. Join the remaining words with single spaces.
///
/// # Panics
///
/// Panics if `sample` is shorter than `config.window_size` or if
/// `config.window_step` is zero.
fn bow_single(sample: &[f64], config: &BagOfWordsConfig) -> String {
    // Guard the subtraction below: without this a short sample would underflow
    // `usize` instead of reporting the real problem.
    assert!(
        config.window_size <= sample.len(),
        "window_size must not exceed n_timestamps"
    );
    // `step_by` panics on zero with a message that does not name the field.
    assert!(config.window_step > 0, "window_step must be positive");

    let last_start = sample.len() - config.window_size;
    let windows: Vec<Vec<f64>> = (0..=last_start)
        .step_by(config.window_step)
        .map(|start| sample[start..start + config.window_size].to_vec())
        .collect();

    let scaled = StandardScaler::transform(&StandardScalerConfig::new(), &windows);

    let sax_config = SaxConfig {
        n_bins: config.n_bins,
        strategy: config.strategy,
        output_size: Some(config.word_size),
    };
    let symbolic = sax_transform_symbolic(&sax_config, &scaled);

    let mut words: Vec<String> = symbolic
        .iter()
        .map(|bins| bins.iter().map(|&bin| char::from(b'a' + bin)).collect())
        .collect();

    match config.numerosity_reduction {
        // `dedup` removes only true consecutive repeats, so no sentinel
        // "previous word" is needed and no word has to be cloned.
        NumerosityReduction::IdenticalConsecutive => words.dedup(),
        NumerosityReduction::None => {}
    }

    words.join(" ")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One rising ramp must produce exactly one non-empty document whose words
    /// all have the configured word size of two characters.
    #[test]
    fn test_bow_basic() {
        let ramp: Vec<f64> = vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
        let documents = BagOfWords::transform(&BagOfWordsConfig::new(4, 2), &[ramp]);

        assert_eq!(documents.len(), 1);
        let document = &documents[0];
        assert!(!document.is_empty());

        let tokens: Vec<&str> = document.split_whitespace().collect();
        assert!(!tokens.is_empty());
        tokens
            .iter()
            .for_each(|token| assert_eq!(token.len(), 2));
    }

    /// Two input samples must yield two output documents.
    #[test]
    fn test_bow_multiple_samples() {
        let rising = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let falling = vec![4.0, 3.0, 2.0, 1.0, 0.0];
        let samples = vec![rising, falling];

        let documents = BagOfWords::transform(&BagOfWordsConfig::new(3, 2), &samples);

        assert_eq!(documents.len(), 2);
    }
}