use crate::approximation::sax::{sax_transform_symbolic, SaxConfig};
use crate::core::config::{BinStrategy, NumerosityReduction, TfIdfVariant};
use crate::core::traits::Transformer;
use crate::preprocessing::scaler::{StandardScaler, StandardScalerConfig};
use crate::utils::tfidf::tfidf_vectorize;
/// Configuration for the Bag-of-Patterns transform.
#[derive(Debug, Clone)]
pub struct BagOfPatternsConfig {
/// Length of each sliding window extracted from a series.
pub window_size: usize,
/// Number of SAX symbols per word; must not exceed `window_size`.
pub word_size: usize,
/// SAX alphabet size (number of discretization bins).
pub n_bins: usize,
/// Binning strategy forwarded to the SAX transform.
pub strategy: BinStrategy,
/// How repeated words are collapsed before TF-IDF vectorization.
pub numerosity_reduction: NumerosityReduction,
/// Stride between consecutive window start positions.
pub window_step: usize,
/// TF-IDF weighting variant applied to the word counts.
pub variant: TfIdfVariant,
}
impl BagOfPatternsConfig {
pub fn new(window_size: usize, word_size: usize) -> Self {
Self {
window_size,
word_size,
n_bins: 4,
strategy: BinStrategy::Normal,
numerosity_reduction: NumerosityReduction::IdenticalConsecutive,
window_step: 1,
variant: TfIdfVariant::SublinearTfIdf,
}
}
}
/// Result of the Bag-of-Patterns transform.
#[derive(Debug, Clone)]
pub struct BagOfPatternsOutput {
/// Distinct SAX words found across all samples; column labels for `matrix`.
pub vocabulary: Vec<String>,
/// One row per input sample; entry `[i][j]` is the TF-IDF weight of
/// `vocabulary[j]` in sample `i`.
pub matrix: Vec<Vec<f64>>,
}
/// Stateless entry point for the Bag-of-Patterns transform.
pub struct BagOfPatterns;

impl BagOfPatterns {
    /// Converts a batch of equal-length time series into a TF-IDF weighted
    /// bag-of-SAX-words representation.
    ///
    /// # Panics
    /// Panics when `x` is empty, when the samples do not all share the same
    /// length, when `window_size` exceeds the series length, when
    /// `word_size` exceeds `window_size`, or when `window_step` is zero.
    pub fn transform(config: &BagOfPatternsConfig, x: &[Vec<f64>]) -> BagOfPatternsOutput {
        assert!(!x.is_empty(), "Input must have at least one sample");
        let n_timestamps = x[0].len();
        // Every sample must share one length: the window extraction computes
        // `len - window_size` per sample, which would underflow (and panic
        // with an opaque message) on a shorter row.
        assert!(
            x.iter().all(|sample| sample.len() == n_timestamps),
            "All samples must have the same number of timestamps"
        );
        assert!(
            config.window_size <= n_timestamps,
            "window_size must not exceed n_timestamps"
        );
        assert!(
            config.word_size <= config.window_size,
            "word_size must not exceed window_size"
        );
        // `step_by(0)` panics deep inside the iterator; fail early instead.
        assert!(config.window_step >= 1, "window_step must be at least 1");
        // One document (sequence of SAX words) per input series.
        let corpus: Vec<Vec<String>> = x
            .iter()
            .map(|sample| extract_words_single(sample, config))
            .collect();
        let (vocabulary, matrix) = tfidf_vectorize(&corpus, config.variant);
        BagOfPatternsOutput { vocabulary, matrix }
    }
}
/// Slides a window over `sample`, z-normalizes each window, discretizes it
/// with SAX into a `word_size`-letter word over the alphabet starting at
/// `'a'`, and optionally collapses identical consecutive words.
fn extract_words_single(sample: &[f64], config: &BagOfPatternsConfig) -> Vec<String> {
    let n = sample.len();
    // Window start positions: 0, step, 2*step, …, up to the last full window.
    let windows: Vec<Vec<f64>> = (0..=n - config.window_size)
        .step_by(config.window_step)
        .map(|i| sample[i..i + config.window_size].to_vec())
        .collect();
    // Each window is standardized independently so the SAX breakpoints apply
    // to zero-mean data within the window, not the whole series.
    let scaler_config = StandardScalerConfig::new();
    let scaled = StandardScaler::transform(&scaler_config, &windows);
    let sax_config = SaxConfig {
        n_bins: config.n_bins,
        strategy: config.strategy,
        output_size: Some(config.word_size),
    };
    let symbolic = sax_transform_symbolic(&sax_config, &scaled);
    // Map bin indices to letters: 0 -> 'a', 1 -> 'b', ….
    // NOTE(review): assumes n_bins <= 26 so symbols stay within 'a'..='z';
    // larger alphabets would spill past 'z' — confirm upstream bounds.
    let mut words: Vec<String> = symbolic
        .iter()
        .map(|bins| bins.iter().map(|&b| (b'a' + b) as char).collect())
        .collect();
    match config.numerosity_reduction {
        NumerosityReduction::IdenticalConsecutive => {
            // `Vec::dedup` removes identical *consecutive* elements in place,
            // which is exactly the classic numerosity-reduction rule the
            // original hand-rolled loop implemented.
            words.dedup();
            words
        }
        NumerosityReduction::None => words,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bag_of_patterns_basic() {
        let config = BagOfPatternsConfig::new(4, 2);
        let ascending: Vec<f64> = (0..8).map(f64::from).collect();
        let descending: Vec<f64> = (0..8).rev().map(f64::from).collect();
        let output = BagOfPatterns::transform(&config, &[ascending, descending]);
        // One matrix row per input series, each as wide as the vocabulary.
        assert_eq!(output.matrix.len(), 2);
        assert!(!output.vocabulary.is_empty());
        assert!(output
            .matrix
            .iter()
            .all(|row| row.len() == output.vocabulary.len()));
    }

    #[test]
    fn test_bag_of_patterns_tf_only() {
        let mut config = BagOfPatternsConfig::new(3, 2);
        config.variant = TfIdfVariant::Tf;
        let x = vec![(0..5).map(f64::from).collect::<Vec<f64>>()];
        let output = BagOfPatterns::transform(&config, &x);
        assert_eq!(output.matrix.len(), 1);
        // Plain term frequencies are never negative.
        assert!(output.matrix[0].iter().all(|&v| v >= 0.0));
    }

    #[test]
    fn test_bag_of_patterns_word_extraction() {
        let sample: Vec<f64> = (0..6).map(f64::from).collect();
        let words = extract_words_single(&sample, &BagOfPatternsConfig::new(3, 2));
        assert!(!words.is_empty());
        // Every extracted word carries exactly `word_size` letters.
        assert!(words.iter().all(|w| w.len() == 2));
    }
}