Skip to main content

libmagic_rs/
tags.rs

1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Tag extraction for file type classification
5//!
6//! This module extracts classification tags from file type descriptions
7//! to enable categorization and filtering of detected file types.
8
9use std::collections::HashSet;
10
/// Tag extractor for file type classification
///
/// Holds a set of lowercase keywords and reports which of them occur in a
/// file type description, so detected file types can be categorized and
/// filtered.
///
/// # Examples
///
/// ```
/// use libmagic_rs::tags::TagExtractor;
///
/// let extractor = TagExtractor::new();
/// let tags = extractor.extract_tags("ELF 64-bit executable");
/// assert!(tags.contains(&"executable".to_string()));
///
/// let tags = extractor.extract_tags("Zip archive data, compressed");
/// assert!(tags.contains(&"archive".to_string()));
/// assert!(tags.contains(&"compressed".to_string()));
/// ```
#[derive(Debug, Clone)]
pub struct TagExtractor {
    /// Lowercase, non-empty keywords to look for in descriptions
    keywords: HashSet<String>,
}

impl Default for TagExtractor {
    /// Equivalent to [`TagExtractor::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl TagExtractor {
    /// Create a new tag extractor with default keywords
    ///
    /// Default keywords include common file type classifications:
    /// - executable, archive, image, video, audio
    /// - document, compressed, encrypted, text, binary
    /// - data, script, font, database, spreadsheet
    /// - presentation
    #[must_use]
    pub fn new() -> Self {
        // Delegate so that default and custom extractors are built by the
        // same normalization path.
        Self::with_keywords([
            "executable",
            "archive",
            "image",
            "video",
            "audio",
            "document",
            "compressed",
            "encrypted",
            "text",
            "binary",
            "data",
            "script",
            "font",
            "database",
            "spreadsheet",
            "presentation",
        ])
    }

    /// Create a tag extractor with custom keywords
    ///
    /// Keywords are lowercased before being stored, and duplicates collapse
    /// because they are kept in a set. Empty keywords are ignored: an empty
    /// string is a substring of every description and would otherwise
    /// produce an empty tag for every input.
    ///
    /// # Arguments
    ///
    /// * `keywords` - Iterator of keyword strings to use for tag extraction
    ///
    /// # Examples
    ///
    /// ```
    /// use libmagic_rs::tags::TagExtractor;
    ///
    /// let extractor = TagExtractor::with_keywords(vec!["custom", "tags"]);
    /// let tags = extractor.extract_tags("This has custom content");
    /// assert!(tags.contains(&"custom".to_string()));
    /// ```
    #[must_use]
    pub fn with_keywords<I, S>(keywords: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        let keywords = keywords
            .into_iter()
            .map(|keyword| keyword.into().to_lowercase())
            // `str::contains("")` is always true, so an empty keyword would
            // match every description.
            .filter(|keyword| !keyword.is_empty())
            .collect();
        Self { keywords }
    }

    /// Extract tags from a file description
    ///
    /// Performs case-insensitive *substring* matching against the keyword
    /// set: a keyword is reported whenever it occurs anywhere in the
    /// lowercased description, including inside a longer word (for example,
    /// a description containing "database" also yields "data").
    /// Returns a sorted vector of unique matching tags.
    ///
    /// # Arguments
    ///
    /// * `description` - The file type description to extract tags from
    ///
    /// # Returns
    ///
    /// A vector of matching tag strings, sorted alphabetically.
    ///
    /// # Examples
    ///
    /// ```
    /// use libmagic_rs::tags::TagExtractor;
    ///
    /// let extractor = TagExtractor::new();
    ///
    /// // Single tag extraction
    /// let tags = extractor.extract_tags("PNG image, 800x600");
    /// assert_eq!(tags, vec!["image".to_string()]);
    ///
    /// // Multiple tags
    /// let tags = extractor.extract_tags("Zip archive, encrypted and compressed");
    /// assert!(tags.contains(&"archive".to_string()));
    /// assert!(tags.contains(&"encrypted".to_string()));
    /// assert!(tags.contains(&"compressed".to_string()));
    /// ```
    #[must_use]
    pub fn extract_tags(&self, description: &str) -> Vec<String> {
        let lower = description.to_lowercase();

        let mut tags: Vec<String> = self
            .keywords
            .iter()
            .filter(|keyword| lower.contains(keyword.as_str()))
            .cloned()
            .collect();

        // Set iteration order is unspecified, so sort for a deterministic
        // result. Tags are unique, hence an unstable sort gives the same order.
        tags.sort_unstable();
        tags
    }

    /// Extract rule path tags from match messages
    ///
    /// Normalizes each match message into a tag-like identifier by:
    /// - Converting to lowercase
    /// - Replacing spaces with hyphens
    /// - Removing every other character that is neither alphanumeric nor a hyphen
    ///
    /// The configured keywords are not consulted; every message produces
    /// exactly one output entry, in input order.
    ///
    /// # Arguments
    ///
    /// * `messages` - Iterator of match messages to convert
    ///
    /// # Returns
    ///
    /// A vector of normalized tag strings.
    ///
    /// # Examples
    ///
    /// ```
    /// use libmagic_rs::tags::TagExtractor;
    ///
    /// let extractor = TagExtractor::new();
    /// let messages = vec!["ELF magic", "64-bit LSB", "executable"];
    /// let tags = extractor.extract_rule_path(messages.iter().copied());
    /// assert_eq!(tags, vec!["elf-magic", "64-bit-lsb", "executable"]);
    /// ```
    #[must_use]
    pub fn extract_rule_path<'a, I>(&self, messages: I) -> Vec<String>
    where
        I: IntoIterator<Item = &'a str>,
    {
        messages
            .into_iter()
            .map(|msg| {
                // Single pass over the lowercased message: spaces become
                // hyphens, other non-alphanumeric characters are dropped.
                msg.to_lowercase()
                    .chars()
                    .filter_map(|c| match c {
                        ' ' => Some('-'),
                        c if c.is_alphanumeric() || c == '-' => Some(c),
                        _ => None,
                    })
                    .collect::<String>()
            })
            .collect()
    }

    /// Get the number of configured keywords
    #[must_use]
    pub fn keyword_count(&self) -> usize {
        self.keywords.len()
    }
}
194
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs an extractor built with the default keywords over `description`.
    fn default_tags(description: &str) -> Vec<String> {
        TagExtractor::new().extract_tags(description)
    }

    /// Asserts that each entry of `expected` appears somewhere in `tags`.
    fn assert_has_tags(tags: &[String], expected: &[&str]) {
        for wanted in expected {
            assert!(
                tags.iter().any(|tag| tag.as_str() == *wanted),
                "expected tag {:?} in {:?}",
                wanted,
                tags
            );
        }
    }

    #[test]
    fn test_new_extractor_has_keywords() {
        assert!(TagExtractor::new().keyword_count() > 10);
    }

    #[test]
    fn test_extract_executable_tag() {
        assert_has_tags(&default_tags("ELF 64-bit executable"), &["executable"]);
    }

    #[test]
    fn test_extract_image_tag() {
        assert_has_tags(&default_tags("PNG image data, 800x600"), &["image"]);
    }

    #[test]
    fn test_extract_archive_tag() {
        assert_has_tags(&default_tags("Zip archive data"), &["archive"]);
    }

    #[test]
    fn test_extract_multiple_tags() {
        assert_has_tags(
            &default_tags("Zip archive, encrypted and compressed"),
            &["archive", "encrypted", "compressed"],
        );
    }

    #[test]
    fn test_case_insensitive() {
        assert_has_tags(&default_tags("EXECUTABLE file"), &["executable"]);
    }

    #[test]
    fn test_no_tags_found() {
        assert!(default_tags("unknown format").is_empty());
    }

    #[test]
    fn test_tags_are_sorted() {
        let expected: Vec<String> = ["archive", "compressed", "data", "encrypted"]
            .iter()
            .map(|tag| tag.to_string())
            .collect();
        assert_eq!(default_tags("compressed archive with encrypted data"), expected);
    }

    #[test]
    fn test_custom_keywords() {
        let extractor = TagExtractor::with_keywords(vec!["custom", "special"]);
        let tags = extractor.extract_tags("This is a custom file with special content");
        assert_has_tags(&tags, &["custom", "special"]);
        // "executable" is not part of the custom keyword set.
        assert!(!tags.contains(&"executable".to_string()));
    }

    #[test]
    fn test_with_keywords_lowercases_input() {
        // Mixed-case keywords must be stored lowercased so they still match
        // a lowercase description.
        let extractor = TagExtractor::with_keywords(vec!["Executable", "ARCHIVE"]);
        let tags = extractor.extract_tags("executable file in archive");
        assert_has_tags(&tags, &["executable", "archive"]);
    }

    #[test]
    fn test_extract_rule_path() {
        let messages = ["ELF magic", "64-bit LSB", "executable"];
        let tags = TagExtractor::new().extract_rule_path(messages.iter().copied());
        assert_eq!(tags, vec!["elf-magic", "64-bit-lsb", "executable"]);
    }

    #[test]
    fn test_extract_rule_path_removes_special_chars() {
        let messages = ["File (version 1.0)", "Data: test!"];
        let tags = TagExtractor::new().extract_rule_path(messages.iter().copied());
        assert_eq!(tags, vec!["file-version-10", "data-test"]);
    }

    #[test]
    fn test_default_trait() {
        assert!(TagExtractor::default().keyword_count() > 0);
    }

    #[test]
    fn test_video_tag() {
        assert_has_tags(&default_tags("MPEG video stream"), &["video"]);
    }

    #[test]
    fn test_audio_tag() {
        assert_has_tags(&default_tags("FLAC audio bitstream data"), &["audio"]);
    }

    #[test]
    fn test_document_tag() {
        assert_has_tags(&default_tags("PDF document, version 1.4"), &["document"]);
    }

    #[test]
    fn test_script_tag() {
        assert_has_tags(
            &default_tags("Python script, ASCII text executable"),
            &["script", "text", "executable"],
        );
    }
}
330        assert!(tags.contains(&"script".to_string()));
331        assert!(tags.contains(&"text".to_string()));
332        assert!(tags.contains(&"executable".to_string()));
333    }
334}