//! oxits 0.1.0
//!
//! Time series classification and transformation library for Rust.
//! Documentation
use crate::approximation::sax::{sax_transform_symbolic, SaxConfig};
use crate::core::config::{BinStrategy, NumerosityReduction, TfIdfVariant};
use crate::core::traits::Transformer;
use crate::preprocessing::scaler::{StandardScaler, StandardScalerConfig};
use crate::utils::tfidf::tfidf_vectorize;

/// Configuration for the BagOfPatterns transform: windowing → SAX → word
/// extraction → TF/TF-IDF vectorization.
///
/// The transform turns each time series into a fixed-length feature vector by:
/// 1. Extracting sliding windows
/// 2. Standardizing each window
/// 3. Applying SAX to each window
/// 4. Extracting words and applying numerosity reduction
/// 5. Building a TF or TF-IDF representation
#[derive(Debug, Clone)]
pub struct BagOfPatternsConfig {
    /// Length of each sliding window extracted from a series.
    pub window_size: usize,
    /// Number of SAX symbols per word; must not exceed `window_size`.
    pub word_size: usize,
    /// Number of SAX quantization bins (alphabet size).
    pub n_bins: usize,
    /// Binning strategy handed to SAX.
    pub strategy: BinStrategy,
    /// How repeated consecutive words are collapsed.
    pub numerosity_reduction: NumerosityReduction,
    /// Stride between consecutive window start positions.
    pub window_step: usize,
    /// TF / TF-IDF weighting variant used for the final vectorization.
    pub variant: TfIdfVariant,
}

impl BagOfPatternsConfig {
    pub fn new(window_size: usize, word_size: usize) -> Self {
        Self {
            window_size,
            word_size,
            n_bins: 4,
            strategy: BinStrategy::Normal,
            numerosity_reduction: NumerosityReduction::IdenticalConsecutive,
            window_step: 1,
            variant: TfIdfVariant::SublinearTfIdf,
        }
    }
}

/// Output of BagOfPatterns: vocabulary + feature matrix.
#[derive(Debug, Clone)]
pub struct BagOfPatternsOutput {
    /// All distinct SAX words observed across the corpus; column `j` of
    /// `matrix` corresponds to `vocabulary[j]`.
    pub vocabulary: Vec<String>,
    /// One row per input sample; `matrix[i][j]` is the TF / TF-IDF weight
    /// of `vocabulary[j]` in sample `i`.
    pub matrix: Vec<Vec<f64>>,
}

/// Stateless entry point for the bag-of-patterns transform.
pub struct BagOfPatterns;

impl BagOfPatterns {
    /// Transform time series into bag-of-patterns feature vectors.
    ///
    /// Returns (vocabulary, feature_matrix) where `feature_matrix[i][j]`
    /// is the TF-IDF weight of `vocabulary[j]` in sample i.
    ///
    /// # Panics
    /// Panics if `x` is empty, if samples have differing lengths, if
    /// `window_size` exceeds the series length, or if `word_size`
    /// exceeds `window_size`.
    pub fn transform(config: &BagOfPatternsConfig, x: &[Vec<f64>]) -> BagOfPatternsOutput {
        assert!(!x.is_empty(), "Input must have at least one sample");
        let n_timestamps = x[0].len();
        // Validate every sample, not just the first: a shorter sample would
        // otherwise trigger a confusing usize-underflow panic deep inside
        // window extraction.
        assert!(
            x.iter().all(|sample| sample.len() == n_timestamps),
            "All samples must have the same number of timestamps"
        );
        assert!(
            config.window_size <= n_timestamps,
            "window_size must not exceed n_timestamps"
        );
        assert!(
            config.word_size <= config.window_size,
            "word_size must not exceed window_size"
        );

        // Steps 1-4: per-sample sliding windows → standardization → SAX →
        // numerosity-reduced word lists.
        let corpus: Vec<Vec<String>> = x
            .iter()
            .map(|sample| extract_words_single(sample, config))
            .collect();

        // Step 5: TF / TF-IDF vectorization over the shared vocabulary.
        let (vocabulary, matrix) = tfidf_vectorize(&corpus, config.variant);

        BagOfPatternsOutput { vocabulary, matrix }
    }
}

/// Extract SAX words from a single sample using sliding windows.
///
/// Pipeline: sliding windows (stride `window_step`) → per-window
/// standardization → SAX symbolization into `word_size` symbols over
/// `n_bins` levels → optional numerosity reduction.
///
/// # Panics
/// Panics if `config.window_size > sample.len()` or if
/// `config.window_step == 0` (via `step_by`).
fn extract_words_single(sample: &[f64], config: &BagOfPatternsConfig) -> Vec<String> {
    let n = sample.len();
    // Guard explicitly: `n - window_size` on usize would underflow-panic
    // with an opaque message for samples shorter than one window.
    assert!(
        config.window_size <= n,
        "window_size must not exceed the sample length"
    );

    // Sliding windows with stride `window_step`.
    let windows: Vec<Vec<f64>> = (0..=n - config.window_size)
        .step_by(config.window_step)
        .map(|i| sample[i..i + config.window_size].to_vec())
        .collect();

    // Standardize each window so SAX bin boundaries are comparable
    // across windows.
    let scaler_config = StandardScalerConfig::new();
    let scaled = StandardScaler::transform(&scaler_config, &windows);

    // Symbolize each window into `word_size` bin indices.
    let sax_config = SaxConfig {
        n_bins: config.n_bins,
        strategy: config.strategy,
        output_size: Some(config.word_size),
    };
    let symbolic = sax_transform_symbolic(&sax_config, &scaled);

    // Map bin indices 0, 1, 2, ... to letters 'a', 'b', 'c', ...
    let mut words: Vec<String> = symbolic
        .iter()
        .map(|bins| bins.iter().map(|&b| (b'a' + b) as char).collect())
        .collect();

    // Numerosity reduction. `Vec::dedup` collapses identical consecutive
    // words and, unlike the previous hand-rolled loop (which seeded the
    // comparison with an empty string), does not drop a leading empty word.
    match config.numerosity_reduction {
        NumerosityReduction::IdenticalConsecutive => words.dedup(),
        NumerosityReduction::None => {}
    }
    words
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: two samples yield a rectangular matrix over a shared,
    /// non-empty vocabulary.
    #[test]
    fn test_bag_of_patterns_basic() {
        let config = BagOfPatternsConfig::new(4, 2);
        let ascending: Vec<f64> = (0..8).map(f64::from).collect();
        let descending: Vec<f64> = (0..8).rev().map(f64::from).collect();
        let output = BagOfPatterns::transform(&config, &[ascending, descending]);

        assert_eq!(output.matrix.len(), 2);
        assert!(!output.vocabulary.is_empty());
        // Every row must span the full vocabulary.
        assert!(output
            .matrix
            .iter()
            .all(|row| row.len() == output.vocabulary.len()));
    }

    /// Plain term-frequency weighting still produces one row per sample
    /// with only non-negative weights.
    #[test]
    fn test_bag_of_patterns_tf_only() {
        let config = BagOfPatternsConfig {
            variant: TfIdfVariant::Tf,
            ..BagOfPatternsConfig::new(3, 2)
        };
        let output = BagOfPatterns::transform(&config, &[vec![0.0, 1.0, 2.0, 3.0, 4.0]]);

        assert_eq!(output.matrix.len(), 1);
        // Term frequencies can never be negative.
        assert!(output.matrix[0].iter().all(|&v| v >= 0.0));
    }

    /// Word extraction on a monotone ramp: at least one word comes out and
    /// each word has exactly `word_size` symbols.
    #[test]
    fn test_bag_of_patterns_word_extraction() {
        let config = BagOfPatternsConfig::new(3, 2);
        let words = extract_words_single(&[0.0, 1.0, 2.0, 3.0, 4.0, 5.0], &config);

        assert!(!words.is_empty());
        assert!(words.iter().all(|word| word.len() == 2));
    }
}