use crate::core::config::NumerosityReduction;
/// Parameters that govern how words are cut out of a symbol sequence.
#[derive(Debug, Clone)]
pub struct WordExtractorConfig {
    /// Number of consecutive symbols that form one word.
    pub window_size: usize,
    /// Offset, in symbols, between the starts of successive windows.
    pub window_step: usize,
    /// How a word that repeats the previously emitted one is treated.
    pub numerosity_reduction: NumerosityReduction,
}

impl WordExtractorConfig {
    /// Creates a configuration for words of `window_size` symbols.
    ///
    /// The remaining settings take their defaults: the window advances one
    /// symbol at a time, and consecutive identical words are collapsed
    /// ([`NumerosityReduction::IdenticalConsecutive`]).
    pub fn new(window_size: usize) -> Self {
        let window_step = 1;
        let numerosity_reduction = NumerosityReduction::IdenticalConsecutive;
        WordExtractorConfig {
            window_size,
            window_step,
            numerosity_reduction,
        }
    }
}
/// Stateless entry point for word extraction; it carries no data of its own.
pub struct WordExtractor;

impl WordExtractor {
    /// Converts every sample in `x` into its sequence of sliding-window words.
    ///
    /// Each inner `Vec<u8>` is one sample of symbols. The result contains one
    /// `Vec<String>` per sample, in the same order as the input.
    ///
    /// # Panics
    ///
    /// Panics if `x` is empty, if `config.window_size` is zero, or if
    /// `config.window_step` is zero.
    pub fn transform(config: &WordExtractorConfig, x: &[Vec<u8>]) -> Vec<Vec<String>> {
        assert!(!x.is_empty(), "Input must have at least one sample");
        assert!(config.window_size > 0, "window_size must be positive");
        // A zero step would otherwise only surface as an opaque assertion from
        // `Iterator::step_by` while the individual samples are being processed.
        assert!(config.window_step > 0, "window_step must be positive");

        x.iter()
            .map(|sample| extract_words_single(sample, config))
            .collect()
    }
}
/// Slides a window over `sample` and renders each window as a word.
///
/// Every symbol `s` in a window becomes the character with code point
/// `'a' + s` (`0 -> 'a'`, `1 -> 'b'`, ...). Windows start at offsets
/// `0, window_step, 2 * window_step, ...` and only full-length windows are
/// used. With [`NumerosityReduction::IdenticalConsecutive`] a word equal to the
/// one emitted immediately before it is dropped; with
/// [`NumerosityReduction::None`] every word is kept.
///
/// Returns an empty vector when `sample` is shorter than `config.window_size`.
///
/// # Panics
///
/// Panics if `config.window_step` is zero and at least one window fits.
fn extract_words_single(sample: &[u8], config: &WordExtractorConfig) -> Vec<String> {
    let size = config.window_size;
    // `slice::windows` rejects a zero size, so a zero here yields no words
    // rather than panicking.
    if size == 0 || sample.len() < size {
        return Vec::new();
    }

    // Widen before adding: `b'a' + symbol` in `u8` overflows for symbols >= 159
    // (a panic in debug builds, silent wrap-around in release builds).
    let base = u32::from(b'a');

    let mut words: Vec<String> = Vec::new();
    for window in sample.windows(size).step_by(config.window_step) {
        let word: String = window
            .iter()
            .map(|&symbol| {
                char::from_u32(base + u32::from(symbol))
                    .expect("'a' + u8 is always a valid Unicode scalar value")
            })
            .collect();

        match config.numerosity_reduction {
            NumerosityReduction::IdenticalConsecutive => {
                // The last pushed word is exactly the previously emitted one,
                // so no separate copy of it has to be maintained.
                if words.last() != Some(&word) {
                    words.push(word);
                }
            }
            NumerosityReduction::None => words.push(word),
        }
    }
    words
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the extractor on one sample and returns the words of that sample.
    fn words_for(config: &WordExtractorConfig, sample: &[u8]) -> Vec<String> {
        let mut per_sample = WordExtractor::transform(config, &[sample.to_vec()]);
        per_sample.swap_remove(0)
    }

    /// The first window of a sample is rendered letter by letter.
    #[test]
    fn test_word_extraction_basic() {
        let words = words_for(&WordExtractorConfig::new(3), &[0, 1, 2, 0, 1, 2]);
        assert!(!words.is_empty());
        assert_eq!(words[0], "abc");
    }

    /// The default configuration collapses consecutive identical words.
    #[test]
    fn test_numerosity_reduction() {
        let words = words_for(&WordExtractorConfig::new(2), &[0, 0, 0, 1]);
        assert_eq!(words, vec!["aa", "ab"]);
    }

    /// With reduction disabled every window produces a word.
    #[test]
    fn test_no_numerosity_reduction() {
        let config = WordExtractorConfig {
            numerosity_reduction: NumerosityReduction::None,
            ..WordExtractorConfig::new(2)
        };
        let words = words_for(&config, &[0, 0, 0, 1]);
        assert_eq!(words, vec!["aa", "aa", "ab"]);
    }

    /// A step equal to the window size yields non-overlapping words.
    #[test]
    fn test_window_step() {
        let config = WordExtractorConfig {
            window_step: 2,
            numerosity_reduction: NumerosityReduction::None,
            ..WordExtractorConfig::new(2)
        };
        let words = words_for(&config, &[0, 1, 2, 3]);
        assert_eq!(words, vec!["ab", "cd"]);
    }
}