//! oxits 0.1.0
//!
//! Time series classification and transformation library for Rust.
//! See the crate-level documentation for usage details.
use crate::approximation::sax::{sax_transform_symbolic, SaxConfig};
use crate::core::config::{BinStrategy, NumerosityReduction};
use crate::core::traits::Transformer;
use crate::preprocessing::scaler::{StandardScaler, StandardScalerConfig};

/// Configuration for the [`BagOfWords`] transformer.
#[derive(Debug, Clone)]
pub struct BagOfWordsConfig {
    /// Length of each sliding window extracted from a sample.
    pub window_size: usize,
    /// Number of letters in each SAX word produced per window.
    pub word_size: usize,
    /// Number of discretization bins (alphabet size) used by SAX.
    pub n_bins: usize,
    /// Binning strategy forwarded to the SAX discretization step.
    pub strategy: BinStrategy,
    /// How repeated consecutive words are collapsed in the output.
    pub numerosity_reduction: NumerosityReduction,
    /// Stride between consecutive window start positions.
    pub window_step: usize,
}

impl BagOfWordsConfig {
    /// Build a configuration from the two mandatory sizes, filling the
    /// remaining fields with the library defaults: 4 bins, normal binning,
    /// consecutive-duplicate reduction, and a window step of 1.
    pub fn new(window_size: usize, word_size: usize) -> Self {
        // Defaults for everything the caller did not specify.
        let n_bins = 4;
        let strategy = BinStrategy::Normal;
        let numerosity_reduction = NumerosityReduction::IdenticalConsecutive;
        let window_step = 1;

        Self {
            window_size,
            word_size,
            n_bins,
            strategy,
            numerosity_reduction,
            window_step,
        }
    }
}

/// Bag-of-words transformer: turns each time series into a
/// space-separated string of SAX words.
pub struct BagOfWords;

impl BagOfWords {
    /// Transform time series into bag-of-words string representations.
    ///
    /// Pipeline per sample:
    /// 1. Extract sliding windows
    /// 2. Apply SAX to each window → symbolic sequences
    /// 3. Extract words
    /// 4. Join words into space-separated string
    ///
    /// # Panics
    /// Panics when `x` is empty, when `config.window_size` is zero or
    /// exceeds the length of any sample, or when `config.window_step`
    /// is zero.
    pub fn transform(config: &BagOfWordsConfig, x: &[Vec<f64>]) -> Vec<String> {
        assert!(!x.is_empty(), "Input must have at least one sample");
        assert!(config.window_size >= 1, "window_size must be at least 1");
        // `Iterator::step_by` panics on a zero step; check up front with a
        // message that names the offending config field.
        assert!(config.window_step >= 1, "window_step must be at least 1");
        // Validate every sample, not just the first: a later sample shorter
        // than the window would otherwise hit a usize underflow deep inside
        // the window extraction with a far less helpful panic message.
        for sample in x {
            assert!(
                config.window_size <= sample.len(),
                "window_size must not exceed n_timestamps"
            );
        }

        x.iter().map(|sample| bow_single(sample, config)).collect()
    }
}

/// Compute the bag-of-words string for a single time series.
///
/// Pipeline: sliding windows → per-window standardization → SAX
/// discretization → letter words → optional numerosity reduction →
/// space-joined string.
///
/// # Panics
/// Panics if `config.window_size` is zero (from `slice::windows`) or if
/// `config.window_step` is zero (from `Iterator::step_by`).
fn bow_single(sample: &[f64], config: &BagOfWordsConfig) -> String {
    // Step 1: Extract sliding windows. `slice::windows` yields no windows
    // when the sample is shorter than the window, instead of the usize
    // underflow panic the manual `0..=n - window_size` range would hit.
    let windows: Vec<Vec<f64>> = sample
        .windows(config.window_size)
        .step_by(config.window_step)
        .map(|w| w.to_vec())
        .collect();

    // Step 2: Standardize each window independently before discretization.
    let scaler_config = StandardScalerConfig::new();
    let scaled = StandardScaler::transform(&scaler_config, &windows);

    // Step 3: Apply PAA + discretization to each window → SAX words.
    let sax_config = SaxConfig {
        n_bins: config.n_bins,
        strategy: config.strategy,
        output_size: Some(config.word_size),
    };
    let symbolic = sax_transform_symbolic(&sax_config, &scaled);

    // Step 4: Map bin indices to letters ('a' + index), one word per
    // window. NOTE(review): assumes n_bins <= 26 so indices stay within
    // 'a'..='z' and `b'a' + b` cannot overflow — confirm upstream checks.
    let mut words: Vec<String> = symbolic
        .iter()
        .map(|bins| bins.iter().map(|&b| (b'a' + b) as char).collect())
        .collect();

    // Step 5: Numerosity reduction. `Vec::dedup` removes identical
    // consecutive entries, keeping the first — exactly the semantics of
    // the hand-rolled loop it replaces.
    match config.numerosity_reduction {
        NumerosityReduction::IdenticalConsecutive => words.dedup(),
        NumerosityReduction::None => {}
    }

    words.join(" ")
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A single monotone ramp must yield a non-empty, space-separated
    /// bag of words whose entries all have length `word_size`.
    #[test]
    fn test_bow_basic() {
        let config = BagOfWordsConfig::new(4, 2);
        let samples = vec![(0..8).map(f64::from).collect::<Vec<_>>()];

        let result = BagOfWords::transform(&config, &samples);

        assert_eq!(result.len(), 1);
        assert!(!result[0].is_empty());
        let words: Vec<&str> = result[0].split_whitespace().collect();
        assert!(!words.is_empty());
        assert!(words.iter().all(|w| w.len() == 2));
    }

    /// One output string is produced per input sample.
    #[test]
    fn test_bow_multiple_samples() {
        let config = BagOfWordsConfig::new(3, 2);
        let rising: Vec<f64> = (0..5).map(f64::from).collect();
        let falling: Vec<f64> = rising.iter().rev().copied().collect();

        let result = BagOfWords::transform(&config, &[rising, falling]);

        assert_eq!(result.len(), 2);
    }
}