use std::collections::HashSet;
/// Derives keyword tags and rule-path slugs from textual descriptions.
///
/// The extractor owns a set of lowercase keywords.
/// [`TagExtractor::extract_tags`] reports every keyword that occurs as a
/// substring of a (lowercased) description.
#[derive(Debug, Clone)]
pub struct TagExtractor {
    /// Keywords to look for; stored lowercase and never empty.
    keywords: HashSet<String>,
}

impl Default for TagExtractor {
    /// Equivalent to [`TagExtractor::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl TagExtractor {
    /// Creates an extractor preloaded with the built-in keyword list.
    #[must_use]
    pub fn new() -> Self {
        const DEFAULT_KEYWORDS: &[&str] = &[
            "executable",
            "archive",
            "image",
            "video",
            "audio",
            "document",
            "compressed",
            "encrypted",
            "text",
            "binary",
            "data",
            "script",
            "font",
            "database",
            "spreadsheet",
            "presentation",
        ];
        // Route the defaults through the same normalization as custom keywords.
        Self::with_keywords(DEFAULT_KEYWORDS.iter().copied())
    }

    /// Creates an extractor that uses `keywords` instead of the built-in list.
    ///
    /// Every keyword is lowercased so that matching against a lowercased
    /// description is case-insensitive. Empty keywords are discarded, and
    /// duplicates collapse because the keywords are stored in a set.
    #[must_use]
    pub fn with_keywords<I, S>(keywords: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        let keywords = keywords
            .into_iter()
            .map(Into::into)
            .map(|keyword: String| keyword.to_lowercase())
            // `str::contains("")` is always true, so an empty keyword would
            // attach an empty tag to every description.
            .filter(|keyword| !keyword.is_empty())
            .collect();
        Self { keywords }
    }

    /// Returns every keyword found in `description`, sorted ascending.
    ///
    /// Matching is a case-insensitive substring search, so a keyword also
    /// matches inside a longer word (e.g. `data` inside `database`).
    #[must_use]
    pub fn extract_tags(&self, description: &str) -> Vec<String> {
        let haystack = description.to_lowercase();
        let mut tags: Vec<String> = self
            .keywords
            .iter()
            .filter(|keyword| haystack.contains(keyword.as_str()))
            .cloned()
            .collect();
        // `HashSet` iteration order is unspecified; sort for a stable result.
        // Keywords are unique, so an unstable sort yields the same order.
        tags.sort_unstable();
        tags
    }

    /// Converts each message into a lowercase, hyphen-separated slug.
    ///
    /// One output string is produced per input message, in input order.
    /// See [`TagExtractor::slugify`] for the exact character rules.
    ///
    /// The receiver is not consulted; it is kept so existing callers that
    /// invoke this as a method continue to compile.
    #[must_use]
    pub fn extract_rule_path<'a, I>(&self, messages: I) -> Vec<String>
    where
        I: IntoIterator<Item = &'a str>,
    {
        messages.into_iter().map(Self::slugify).collect()
    }

    /// Lowercases `message`, turns each space (`' '`) into `-`, and removes
    /// every other character that is neither alphanumeric nor `-`.
    ///
    /// Consecutive spaces therefore yield consecutive hyphens, and other
    /// whitespace such as tabs is removed rather than replaced.
    fn slugify(message: &str) -> String {
        message
            .to_lowercase()
            .chars()
            .filter_map(|c| match c {
                ' ' => Some('-'),
                _ if c.is_alphanumeric() || c == '-' => Some(c),
                _ => None,
            })
            .collect()
    }

    /// Returns the number of distinct keywords held by this extractor.
    #[must_use]
    pub fn keyword_count(&self) -> usize {
        self.keywords.len()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Extracts tags from `description` using the built-in keyword list.
    fn default_tags(description: &str) -> Vec<String> {
        TagExtractor::new().extract_tags(description)
    }

    /// Asserts that each entry of `expected` appears somewhere in `tags`.
    fn assert_has_tags(tags: &[String], expected: &[&str]) {
        for want in expected {
            assert!(tags.iter().any(|tag| tag.as_str() == *want));
        }
    }

    #[test]
    fn test_new_extractor_has_keywords() {
        assert!(TagExtractor::new().keyword_count() > 10);
    }

    #[test]
    fn test_extract_executable_tag() {
        assert_has_tags(&default_tags("ELF 64-bit executable"), &["executable"]);
    }

    #[test]
    fn test_extract_image_tag() {
        assert_has_tags(&default_tags("PNG image data, 800x600"), &["image"]);
    }

    #[test]
    fn test_extract_archive_tag() {
        assert_has_tags(&default_tags("Zip archive data"), &["archive"]);
    }

    #[test]
    fn test_extract_multiple_tags() {
        let found = default_tags("Zip archive, encrypted and compressed");
        assert_has_tags(&found, &["archive", "encrypted", "compressed"]);
    }

    #[test]
    fn test_case_insensitive() {
        assert_has_tags(&default_tags("EXECUTABLE file"), &["executable"]);
    }

    #[test]
    fn test_no_tags_found() {
        assert!(default_tags("unknown format").is_empty());
    }

    #[test]
    fn test_tags_are_sorted() {
        let found = default_tags("compressed archive with encrypted data");
        let expected: Vec<String> = ["archive", "compressed", "data", "encrypted"]
            .iter()
            .map(|tag| (*tag).to_string())
            .collect();
        assert_eq!(found, expected);
    }

    #[test]
    fn test_custom_keywords() {
        let custom = TagExtractor::with_keywords(vec!["custom", "special"]);
        let found = custom.extract_tags("This is a custom file with special content");
        assert_has_tags(&found, &["custom", "special"]);
        // Built-in keywords must not leak into a custom extractor.
        assert!(!found.iter().any(|tag| tag.as_str() == "executable"));
    }

    #[test]
    fn test_with_keywords_lowercases_input() {
        let custom = TagExtractor::with_keywords(vec!["Executable", "ARCHIVE"]);
        let found = custom.extract_tags("executable file in archive");
        assert_has_tags(&found, &["executable", "archive"]);
    }

    #[test]
    fn test_extract_rule_path() {
        let path =
            TagExtractor::new().extract_rule_path(vec!["ELF magic", "64-bit LSB", "executable"]);
        assert_eq!(path, vec!["elf-magic", "64-bit-lsb", "executable"]);
    }

    #[test]
    fn test_extract_rule_path_removes_special_chars() {
        let path =
            TagExtractor::new().extract_rule_path(vec!["File (version 1.0)", "Data: test!"]);
        assert_eq!(path, vec!["file-version-10", "data-test"]);
    }

    #[test]
    fn test_default_trait() {
        assert!(TagExtractor::default().keyword_count() > 0);
    }

    #[test]
    fn test_video_tag() {
        assert_has_tags(&default_tags("MPEG video stream"), &["video"]);
    }

    #[test]
    fn test_audio_tag() {
        assert_has_tags(&default_tags("FLAC audio bitstream data"), &["audio"]);
    }

    #[test]
    fn test_document_tag() {
        assert_has_tags(&default_tags("PDF document, version 1.4"), &["document"]);
    }

    #[test]
    fn test_script_tag() {
        let found = default_tags("Python script, ASCII text executable");
        assert_has_tags(&found, &["script", "text", "executable"]);
    }
}