Skip to main content

file_identify/
tags.rs

1use std::collections::HashSet;
2use std::sync::LazyLock;
3
4use crate::extensions;
5use crate::interpreters;
6
/// Type tag for a directory; a member of [`TYPE_TAGS`].
pub const DIRECTORY: &str = "directory";
/// Type tag for a symbolic link; a member of [`TYPE_TAGS`].
pub const SYMLINK: &str = "symlink";
/// Type tag for a socket; a member of [`TYPE_TAGS`].
pub const SOCKET: &str = "socket";
/// Type tag for a file; a member of [`TYPE_TAGS`].
pub const FILE: &str = "file";
/// Mode tag for an executable entry; a member of [`MODE_TAGS`].
pub const EXECUTABLE: &str = "executable";
/// Mode tag for a non-executable entry; a member of [`MODE_TAGS`].
pub const NON_EXECUTABLE: &str = "non-executable";
/// Encoding tag for text content; a member of [`ENCODING_TAGS`].
pub const TEXT: &str = "text";
/// Encoding tag for binary content; a member of [`ENCODING_TAGS`].
pub const BINARY: &str = "binary";
15
/// A set of tags, each a `&'static str` such as [`FILE`] or [`TEXT`].
pub type TagSet = HashSet<&'static str>;
17
18/// Helper function to convert a static array of tags to a TagSet.
19#[inline]
20pub fn tags_from_array(tags: &[&'static str]) -> TagSet {
21    tags.iter().copied().collect()
22}
23
24pub static TYPE_TAGS: LazyLock<TagSet> =
25    LazyLock::new(|| HashSet::from([DIRECTORY, FILE, SYMLINK, SOCKET]));
26pub static MODE_TAGS: LazyLock<TagSet> =
27    LazyLock::new(|| HashSet::from([EXECUTABLE, NON_EXECUTABLE]));
28pub static ENCODING_TAGS: LazyLock<TagSet> = LazyLock::new(|| HashSet::from([BINARY, TEXT]));
29
30/// Check if a tag is a file type tag (optimized with pattern matching)
31pub fn is_type_tag(tag: &str) -> bool {
32    matches!(tag, DIRECTORY | FILE | SYMLINK | SOCKET)
33}
34
35/// Check if a tag is a file mode tag (optimized with pattern matching)  
36pub fn is_mode_tag(tag: &str) -> bool {
37    matches!(tag, EXECUTABLE | NON_EXECUTABLE)
38}
39
40/// Check if a tag is an encoding tag (optimized with pattern matching)
41pub fn is_encoding_tag(tag: &str) -> bool {
42    matches!(tag, BINARY | TEXT)
43}
44
45/// `ALL_TAGS = frozenset(_ALL_TAGS)`
46///
47/// The set of every recognized tag — built from type, mode, encoding tags
48/// plus every tag from extensions, names, and interpreters.
49///
50/// Mirrors Python's `identify.identify.ALL_TAGS`.
51pub static ALL_TAGS: LazyLock<TagSet> = LazyLock::new(|| {
52    let mut tags = TagSet::new();
53
54    // _ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS}
55    tags.extend(TYPE_TAGS.iter());
56    tags.extend(MODE_TAGS.iter());
57    tags.extend(ENCODING_TAGS.iter());
58
59    // _ALL_TAGS.update(*extensions.EXTENSIONS.values())
60    for entry in extensions::EXTENSION_TAGS.values() {
61        tags.extend(entry.iter().copied());
62    }
63
64    // _ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
65    for entry in extensions::EXTENSIONS_NEED_BINARY_CHECK_TAGS.values() {
66        tags.extend(entry.iter().copied());
67    }
68
69    // _ALL_TAGS.update(*extensions.NAMES.values())
70    for entry in extensions::NAME_TAGS.values() {
71        tags.extend(entry.iter().copied());
72    }
73
74    // _ALL_TAGS.update(*interpreters.INTERPRETERS.values())
75    for entry in interpreters::INTERPRETER_TAGS.values() {
76        tags.extend(entry.iter().copied());
77    }
78
79    tags
80});
81
/// The kind of filesystem entry.
///
/// The four variants parallel the four type tags ([`FILE`], [`DIRECTORY`],
/// [`SYMLINK`], [`SOCKET`]); the code that maps a `FileKind` to a tag is not
/// part of this module — NOTE(review): confirm the mapping at the call sites.
///
/// The enum is field-less and derives `Copy`, `Eq`, and `Hash`, so it can be
/// passed by value and used as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileKind {
    /// A regular file.
    Regular,
    /// A directory.
    Directory,
    /// A symbolic link.
    Symlink,
    /// A socket.
    Socket,
}
90
#[cfg(test)]
mod tests {
    use super::*;

    /// `ALL_TAGS` must contain representatives from each source it is built from.
    #[test]
    fn test_all_tags_contains_basics() {
        let expected = [
            // type tags
            "file",
            "directory",
            // mode tags
            "executable",
            "non-executable",
            // encoding tags
            "text",
            "binary",
            // language tags from extensions
            "python",
            "rust",
            "javascript",
            "json",
            // tags from interpreters
            "shell",
            "ruby",
            "perl",
        ];

        for &tag in &expected {
            // Name the tag on failure so the missing entry is obvious.
            assert!(ALL_TAGS.contains(tag), "missing tag: {}", tag);
        }
    }

    /// A string that is not a tag anywhere must not appear in `ALL_TAGS`.
    #[test]
    fn test_all_tags_does_not_contain_garbage() {
        let bogus = "notarealtag";
        assert!(!ALL_TAGS.contains(bogus));
    }
}
121}