1use std::collections::HashSet;
10
/// Derives tags from description text by looking for a configurable set of
/// keywords.
///
/// Keywords are stored in lowercase and descriptions are lowercased before
/// matching, so matching is case-insensitive.
#[derive(Debug, Clone)]
pub struct TagExtractor {
    /// Lowercase keywords that [`TagExtractor::extract_tags`] searches for.
    keywords: HashSet<String>,
}

impl Default for TagExtractor {
    /// Equivalent to [`TagExtractor::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl TagExtractor {
    /// Creates an extractor preloaded with the built-in keyword set
    /// (`executable`, `archive`, `image`, `video`, `audio`, ...).
    #[must_use]
    pub fn new() -> Self {
        const DEFAULT_KEYWORDS: [&str; 16] = [
            "executable",
            "archive",
            "image",
            "video",
            "audio",
            "document",
            "compressed",
            "encrypted",
            "text",
            "binary",
            "data",
            "script",
            "font",
            "database",
            "spreadsheet",
            "presentation",
        ];

        // Route through `with_keywords` so keyword normalisation lives in one place.
        Self::with_keywords(DEFAULT_KEYWORDS.iter().copied())
    }

    /// Creates an extractor that recognises only the given keywords.
    ///
    /// Each keyword is lowercased before it is stored, and duplicates collapse
    /// because the keywords are kept in a set. Keywords that are empty or
    /// consist only of whitespace are ignored.
    #[must_use]
    pub fn with_keywords<I, S>(keywords: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        let keywords = keywords
            .into_iter()
            .map(|s| s.into().to_lowercase())
            // The empty string is a substring of every description, so a blank
            // keyword would turn into a meaningless tag on every single input.
            .filter(|keyword| !keyword.trim().is_empty())
            .collect();
        Self { keywords }
    }

    /// Returns every known keyword that occurs in `description`, sorted
    /// alphabetically.
    ///
    /// Matching is a case-insensitive substring search, not a whole-word
    /// match: a description containing `database` yields both `data` and
    /// `database` when both keywords are configured. The result is empty when
    /// no keyword occurs.
    #[must_use]
    pub fn extract_tags(&self, description: &str) -> Vec<String> {
        let lower = description.to_lowercase();

        let mut tags: Vec<String> = self
            .keywords
            .iter()
            .filter(|keyword| lower.contains(keyword.as_str()))
            .cloned()
            .collect();

        // Set iteration order is arbitrary; sort for a deterministic result.
        // Keywords are unique, so an unstable sort yields the same order.
        tags.sort_unstable();
        tags
    }

    /// Converts each message into a lowercase, hyphen-separated identifier.
    ///
    /// For every message, in input order: the text is lowercased, each space
    /// becomes `-`, and every other character that is neither alphanumeric nor
    /// `-` is dropped. For example `"File (version 1.0)"` becomes
    /// `"file-version-10"`. The configured keyword set is not consulted.
    #[must_use]
    pub fn extract_rule_path<'a, I>(&self, messages: I) -> Vec<String>
    where
        I: IntoIterator<Item = &'a str>,
    {
        messages
            .into_iter()
            .map(|msg| {
                msg.to_lowercase()
                    .chars()
                    .filter_map(|c| match c {
                        ' ' => Some('-'),
                        c if c.is_alphanumeric() || c == '-' => Some(c),
                        _ => None,
                    })
                    .collect::<String>()
            })
            .collect()
    }

    /// Returns the number of distinct keywords this extractor recognises.
    #[must_use]
    pub fn keyword_count(&self) -> usize {
        self.keywords.len()
    }
}
194
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs an extractor built with the default keyword set over `description`.
    fn default_tags(description: &str) -> Vec<String> {
        TagExtractor::new().extract_tags(description)
    }

    /// Asserts that every name in `expected` is present in `tags`.
    fn assert_has_tags(tags: &[String], expected: &[&str]) {
        for name in expected {
            assert!(
                tags.iter().any(|tag| tag.as_str() == *name),
                "expected tag `{}` in {:?}",
                name,
                tags
            );
        }
    }

    #[test]
    fn test_new_extractor_has_keywords() {
        assert!(TagExtractor::new().keyword_count() > 10);
    }

    #[test]
    fn test_extract_executable_tag() {
        assert_has_tags(&default_tags("ELF 64-bit executable"), &["executable"]);
    }

    #[test]
    fn test_extract_image_tag() {
        assert_has_tags(&default_tags("PNG image data, 800x600"), &["image"]);
    }

    #[test]
    fn test_extract_archive_tag() {
        assert_has_tags(&default_tags("Zip archive data"), &["archive"]);
    }

    #[test]
    fn test_extract_multiple_tags() {
        assert_has_tags(
            &default_tags("Zip archive, encrypted and compressed"),
            &["archive", "encrypted", "compressed"],
        );
    }

    #[test]
    fn test_case_insensitive() {
        assert_has_tags(&default_tags("EXECUTABLE file"), &["executable"]);
    }

    #[test]
    fn test_no_tags_found() {
        assert!(default_tags("unknown format").is_empty());
    }

    #[test]
    fn test_tags_are_sorted() {
        let expected: Vec<String> = ["archive", "compressed", "data", "encrypted"]
            .iter()
            .map(|name| name.to_string())
            .collect();
        assert_eq!(
            default_tags("compressed archive with encrypted data"),
            expected
        );
    }

    #[test]
    fn test_custom_keywords() {
        let custom = TagExtractor::with_keywords(vec!["custom", "special"]);
        let found = custom.extract_tags("This is a custom file with special content");
        assert_has_tags(&found, &["custom", "special"]);
        // Built-in keywords must not leak into a custom extractor.
        assert!(found.iter().all(|tag| tag.as_str() != "executable"));
    }

    #[test]
    fn test_with_keywords_lowercases_input() {
        let mixed_case = TagExtractor::with_keywords(vec!["Executable", "ARCHIVE"]);
        let found = mixed_case.extract_tags("executable file in archive");
        assert_has_tags(&found, &["executable", "archive"]);
    }

    #[test]
    fn test_extract_rule_path() {
        let parts = ["ELF magic", "64-bit LSB", "executable"];
        let path = TagExtractor::new().extract_rule_path(parts.iter().copied());
        assert_eq!(path, vec!["elf-magic", "64-bit-lsb", "executable"]);
    }

    #[test]
    fn test_extract_rule_path_removes_special_chars() {
        let parts = ["File (version 1.0)", "Data: test!"];
        let path = TagExtractor::new().extract_rule_path(parts.iter().copied());
        assert_eq!(path, vec!["file-version-10", "data-test"]);
    }

    #[test]
    fn test_default_trait() {
        assert!(TagExtractor::default().keyword_count() > 0);
    }

    #[test]
    fn test_video_tag() {
        assert_has_tags(&default_tags("MPEG video stream"), &["video"]);
    }

    #[test]
    fn test_audio_tag() {
        assert_has_tags(&default_tags("FLAC audio bitstream data"), &["audio"]);
    }

    #[test]
    fn test_document_tag() {
        assert_has_tags(&default_tags("PDF document, version 1.4"), &["document"]);
    }

    #[test]
    fn test_script_tag() {
        assert_has_tags(
            &default_tags("Python script, ASCII text executable"),
            &["script", "text", "executable"],
        );
    }
}