use std::collections::HashMap;
pub type NGram = Vec<String>;
pub type NGramMap = HashMap<NGram, usize>;
pub fn extract_ngrams(syscalls: &[String], n: usize) -> NGramMap {
let mut ngrams: NGramMap = HashMap::new();
if syscalls.len() < n {
return ngrams; }
for window in syscalls.windows(n) {
let ngram = window.to_vec();
*ngrams.entry(ngram).or_insert(0) += 1;
}
ngrams
}
pub fn ngram_coverage(ngrams: &NGramMap) -> f64 {
if ngrams.is_empty() {
return 0.0;
}
let unique_count = ngrams.len();
let total_count: usize = ngrams.values().sum();
unique_count as f64 / total_count as f64
}
pub fn top_ngrams(ngrams: &NGramMap, k: usize) -> Vec<(NGram, usize)> {
let mut ngram_vec: Vec<_> =
ngrams.iter().map(|(ngram, count)| (ngram.clone(), *count)).collect();
ngram_vec.sort_by(|a, b| b.1.cmp(&a.1));
ngram_vec.into_iter().take(k).collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_ngrams_basic() {
let syscalls =
vec!["mmap".to_string(), "read".to_string(), "write".to_string(), "close".to_string()];
let ngrams = extract_ngrams(&syscalls, 3);
assert_eq!(ngrams.len(), 2);
let ngram1 = vec!["mmap".to_string(), "read".to_string(), "write".to_string()];
let ngram2 = vec!["read".to_string(), "write".to_string(), "close".to_string()];
assert_eq!(ngrams.get(&ngram1), Some(&1));
assert_eq!(ngrams.get(&ngram2), Some(&1));
}
#[test]
fn test_extract_ngrams_repeated() {
let syscalls = vec![
"mmap".to_string(),
"read".to_string(),
"write".to_string(),
"mmap".to_string(),
"read".to_string(),
"write".to_string(),
];
let ngrams = extract_ngrams(&syscalls, 3);
let ngram = vec!["mmap".to_string(), "read".to_string(), "write".to_string()];
assert_eq!(ngrams.get(&ngram), Some(&2));
}
#[test]
fn test_extract_ngrams_insufficient_length() {
let syscalls = vec!["mmap".to_string(), "read".to_string()];
let ngrams = extract_ngrams(&syscalls, 3);
assert_eq!(ngrams.len(), 0);
}
#[test]
fn test_ngram_coverage() {
let mut ngrams = HashMap::new();
ngrams.insert(vec!["a".to_string(), "b".to_string()], 1);
ngrams.insert(vec!["b".to_string(), "c".to_string()], 1);
ngrams.insert(vec!["c".to_string(), "d".to_string()], 1);
assert_eq!(ngram_coverage(&ngrams), 1.0);
}
#[test]
fn test_ngram_coverage_repetitive() {
let mut ngrams = HashMap::new();
ngrams.insert(vec!["a".to_string(), "b".to_string()], 10);
ngrams.insert(vec!["b".to_string(), "c".to_string()], 1);
let coverage = ngram_coverage(&ngrams);
assert!((coverage - 0.181).abs() < 0.01);
}
#[test]
fn test_top_ngrams() {
let mut ngrams = HashMap::new();
ngrams.insert(vec!["a".to_string(), "b".to_string()], 10);
ngrams.insert(vec!["b".to_string(), "c".to_string()], 5);
ngrams.insert(vec!["c".to_string(), "d".to_string()], 1);
let top = top_ngrams(&ngrams, 2);
assert_eq!(top.len(), 2);
assert_eq!(top[0].1, 10); assert_eq!(top[1].1, 5);
}
}